Search in sources :

Example 16 with WriteOperation

use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testBranchTopologicalSort.

/**
 * Checks that the topological sort honours operation dependencies for a branching graph,
 * regardless of the order in which the operations were inserted into the input set.
 */
@Test
public void testBranchTopologicalSort() {
    // Graph under test:
    //
    //   read -----------------------> write
    //      \                        /
    //       ---> parse ---> normalize
    //
    // write consumes read.offset, parse.name and normalize.address.
    ReadOperation readOp =
            new ReadOperation("read", "read descr", EndPoint.of("ns", "input"), "offset", "body");
    TransformOperation parseOp = new TransformOperation("parse", "parse descr",
            Collections.singletonList(InputField.of("read", "body")), "name", "address");
    TransformOperation normalizeOp = new TransformOperation("normalize", "normalize descr",
            Collections.singletonList(InputField.of("parse", "address")), "address");

    List<InputField> inputsOfWrite = new ArrayList<>();
    inputsOfWrite.add(InputField.of("read", "offset"));
    inputsOfWrite.add(InputField.of("parse", "name"));
    inputsOfWrite.add(InputField.of("normalize", "address"));
    WriteOperation writeOp =
            new WriteOperation("write", "write descr", EndPoint.of("ns", "output"), inputsOfWrite);

    // Case 1: insertion order already follows the dependency order.
    Set<Operation> inDependencyOrder = new LinkedHashSet<>();
    inDependencyOrder.add(readOp);
    inDependencyOrder.add(parseOp);
    inDependencyOrder.add(normalizeOp);
    inDependencyOrder.add(writeOp);
    List<Operation> sorted = FieldLineageInfo.getTopologicallySortedOperations(inDependencyOrder);
    assertBefore(sorted, readOp, parseOp);
    assertBefore(sorted, parseOp, normalizeOp);
    assertBefore(sorted, normalizeOp, writeOp);
    assertBefore(sorted, readOp, writeOp);

    // Case 2: the source operation is inserted last.
    Set<Operation> readInsertedLast = new LinkedHashSet<>();
    readInsertedLast.add(parseOp);
    readInsertedLast.add(normalizeOp);
    readInsertedLast.add(writeOp);
    readInsertedLast.add(readOp);
    sorted = FieldLineageInfo.getTopologicallySortedOperations(readInsertedLast);
    assertBefore(sorted, readOp, parseOp);
    assertBefore(sorted, parseOp, normalizeOp);
    assertBefore(sorted, normalizeOp, writeOp);
    assertBefore(sorted, readOp, writeOp);

    // Case 3: insertion order is the exact reverse of the dependency order.
    Set<Operation> inReverseOrder = new LinkedHashSet<>();
    inReverseOrder.add(writeOp);
    inReverseOrder.add(normalizeOp);
    inReverseOrder.add(parseOp);
    inReverseOrder.add(readOp);
    sorted = FieldLineageInfo.getTopologicallySortedOperations(inReverseOrder);
    assertBefore(sorted, readOp, parseOp);
    assertBefore(sorted, parseOp, normalizeOp);
    assertBefore(sorted, normalizeOp, writeOp);
    assertBefore(sorted, readOp, writeOp);

    // Case 4: when field lineage is queried for the offset field, only the read and write
    // operations are returned, because parse and normalize do not affect offset in any way.
    // The write operation still lists normalize as the origin of one of its inputs, so the
    // sort must not be affected by an origin operation that is missing from the input set.
    Set<Operation> readAndWriteOnly = new LinkedHashSet<>();
    readAndWriteOnly.add(writeOp);
    readAndWriteOnly.add(readOp);
    sorted = FieldLineageInfo.getTopologicallySortedOperations(readAndWriteOnly);
    assertBefore(sorted, readOp, writeOp);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) InputField(io.cdap.cdap.api.lineage.field.InputField) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Test(org.junit.Test)

Example 17 with WriteOperation

use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testMultiPathFieldLineage.

/**
 * Verifies destination fields, incoming/outgoing summaries and outgoing operations for a
 * lineage graph in which two sources are merged before being parsed and written.
 */
@Test
public void testMultiPathFieldLineage() {
    // read1: file1 -> (offset, body)
    // read2: file2 -> (offset, body)
    // merge: (read1.offset, read1.body, read2.offset, read2.body) -> (offset, body)
    // parse: (merge.body) -> (name, address)
    // write: (parse.name, parse.address, merge.offset) -> file
    EndPoint read1EndPoint = EndPoint.of("ns1", "file1");
    EndPoint read2EndPoint = EndPoint.of("ns2", "file2");
    EndPoint fileEndPoint = EndPoint.of("ns3", "file");

    ReadOperation read1 =
            new ReadOperation("read1", "Reading from file1", read1EndPoint, "offset", "body");
    ReadOperation read2 =
            new ReadOperation("read2", "Reading from file2", read2EndPoint, "offset", "body");
    List<InputField> mergeInputs = Arrays.asList(
            InputField.of("read1", "offset"),
            InputField.of("read2", "offset"),
            InputField.of("read1", "body"),
            InputField.of("read2", "body"));
    TransformOperation merge =
            new TransformOperation("merge", "merging fields", mergeInputs, "offset", "body");
    TransformOperation parse = new TransformOperation("parse", "parsing body",
            Collections.singletonList(InputField.of("merge", "body")), "name", "address");
    List<InputField> writeInputs = Arrays.asList(
            InputField.of("merge", "offset"),
            InputField.of("parse", "name"),
            InputField.of("parse", "address"));
    WriteOperation write =
            new WriteOperation("write", "writing to another file", fileEndPoint, writeInputs);

    // Operations are deliberately supplied out of dependency order.
    List<Operation> operations =
            new ArrayList<>(Arrays.<Operation>asList(parse, merge, read1, read2, write));
    FieldLineageInfo fllInfo = new FieldLineageInfo(operations);

    // The single destination receives exactly the three written fields.
    Map<EndPoint, Set<String>> destinationFields = fllInfo.getDestinationFields();
    Assert.assertEquals(1, destinationFields.size());
    Assert.assertEquals(new HashSet<>(Arrays.asList("name", "address", "offset")),
            destinationFields.get(fileEndPoint));

    // Every destination field is expected to trace back to all four source fields.
    Map<EndPointField, Set<EndPointField>> incomingSummary = fllInfo.getIncomingSummary();
    Assert.assertEquals(3, incomingSummary.size());
    Set<EndPointField> allSourceFields = new HashSet<>(Arrays.asList(
            new EndPointField(read1EndPoint, "body"),
            new EndPointField(read1EndPoint, "offset"),
            new EndPointField(read2EndPoint, "body"),
            new EndPointField(read2EndPoint, "offset")));
    for (String destinationField : Arrays.asList("name", "address", "offset")) {
        Assert.assertEquals(allSourceFields,
                incomingSummary.get(new EndPointField(fileEndPoint, destinationField)));
    }

    // Conversely, every source field is expected to reach all three destination fields.
    Map<EndPointField, Set<EndPointField>> outgoingSummary = fllInfo.getOutgoingSummary();
    Assert.assertEquals(4, outgoingSummary.size());
    Set<EndPointField> allDestinationFields = new HashSet<>(Arrays.asList(
            new EndPointField(fileEndPoint, "offset"),
            new EndPointField(fileEndPoint, "name"),
            new EndPointField(fileEndPoint, "address")));
    for (EndPoint source : Arrays.asList(read1EndPoint, read2EndPoint)) {
        for (String sourceField : Arrays.asList("offset", "body")) {
            Assert.assertEquals(allDestinationFields,
                    outgoingSummary.get(new EndPointField(source, sourceField)));
        }
    }

    // Test outgoing operations of all source endpoints: each source field is expected to yield
    // its own read operation plus the shared merge, parse and write operations.
    Set<Operation> operationsFromRead1 =
            new HashSet<>(Arrays.<Operation>asList(read1, merge, parse, write));
    for (String sourceField : Arrays.asList("offset", "body")) {
        Assert.assertEquals(operationsFromRead1,
                fllInfo.getOutgoingOperationsForField(new EndPointField(read1EndPoint, sourceField)));
    }
    Set<Operation> operationsFromRead2 =
            new HashSet<>(Arrays.<Operation>asList(read2, merge, parse, write));
    for (String sourceField : Arrays.asList("offset", "body")) {
        Assert.assertEquals(operationsFromRead2,
                fllInfo.getOutgoingOperationsForField(new EndPointField(read2EndPoint, sourceField)));
    }
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) ArrayList(java.util.ArrayList) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) Test(org.junit.Test)

Example 18 with WriteOperation

use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testDirectReadWrite.

/**
 * Verifies the incoming and outgoing summaries when a write operation consumes the fields of
 * two read operations directly, with no transform in between.
 */
@Test
public void testDirectReadWrite() {
    // The operations build their own EndPoint instances; the expectations below use separately
    // constructed ones, so the assertions depend on value equality rather than identity.
    ReadOperation read =
            new ReadOperation("read", "", EndPoint.of("ns1", "endpoint1"), "id", "name");
    ReadOperation anotherRead =
            new ReadOperation("anotherRead", "", EndPoint.of("ns1", "endpoint2"), "id1", "name1");
    WriteOperation write = new WriteOperation("write", "", EndPoint.of("ns1", "endpoint3"),
            InputField.of("read", "id"),
            InputField.of("read", "name"),
            InputField.of("anotherRead", "id1"),
            InputField.of("anotherRead", "name1"));

    List<Operation> operations = new ArrayList<>();
    operations.add(read);
    operations.add(write);
    operations.add(anotherRead);
    FieldLineageInfo info = new FieldLineageInfo(operations);

    EndPoint firstSource = EndPoint.of("ns1", "endpoint1");
    EndPoint secondSource = EndPoint.of("ns1", "endpoint2");
    EndPoint sink = EndPoint.of("ns1", "endpoint3");

    EndPointField sourceId = new EndPointField(firstSource, "id");
    EndPointField sourceName = new EndPointField(firstSource, "name");
    EndPointField sourceId1 = new EndPointField(secondSource, "id1");
    EndPointField sourceName1 = new EndPointField(secondSource, "name1");
    EndPointField sinkId = new EndPointField(sink, "id");
    EndPointField sinkName = new EndPointField(sink, "name");
    EndPointField sinkId1 = new EndPointField(sink, "id1");
    EndPointField sinkName1 = new EndPointField(sink, "name1");

    // Incoming: each sink field maps to the single source field of the same name.
    Map<EndPointField, Set<EndPointField>> incoming = info.getIncomingSummary();
    Map<EndPointField, Set<EndPointField>> expectedIncoming = ImmutableMap.of(
            sinkId, Collections.singleton(sourceId),
            sinkId1, Collections.singleton(sourceId1),
            sinkName, Collections.singleton(sourceName),
            sinkName1, Collections.singleton(sourceName1));
    Assert.assertEquals(expectedIncoming, incoming);

    // Outgoing: the same relation, seen from the source side.
    Map<EndPointField, Set<EndPointField>> outgoing = info.getOutgoingSummary();
    Map<EndPointField, Set<EndPointField>> expectedOutgoing = ImmutableMap.of(
            sourceId, Collections.singleton(sinkId),
            sourceId1, Collections.singleton(sinkId1),
            sourceName, Collections.singleton(sinkName),
            sourceName1, Collections.singleton(sinkName1));
    Assert.assertEquals(expectedOutgoing, outgoing);
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) Test(org.junit.Test)

Example 19 with WriteOperation

use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.

the class FieldLineageInfoTest method testValidOperations.

/**
 * Verifies that FieldLineageInfo equality survives a JSON round trip of its operations and a
 * reordering of the input operations, and that it is broken by a change of endpoint namespace.
 */
@Test
public void testValidOperations() {
    ReadOperation read =
            new ReadOperation("read", "some read", EndPoint.of("endpoint1"), "offset", "body");
    TransformOperation parse = new TransformOperation("parse", "parse body",
            Collections.singletonList(InputField.of("read", "body")), "name", "address");
    // NOTE(review): the write consumes parse.body although parse is declared above with the
    // outputs name and address only -- confirm this is intentional.
    WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("ns", "endpoint2"),
            Arrays.asList(
                    InputField.of("read", "offset"),
                    InputField.of("parse", "name"),
                    InputField.of("parse", "body")));

    List<Operation> operations = new ArrayList<>();
    operations.add(read);
    operations.add(write);
    operations.add(parse);
    FieldLineageInfo baseline = new FieldLineageInfo(operations);

    // Serializing the operations to JSON and reading them back must give an equal lineage info
    // (i.e. the same checksum).
    Type operationSetType = new TypeToken<Set<Operation>>() { }.getType();
    String serializedOperations = GSON.toJson(baseline.getOperations());
    Set<Operation> deserializedOperations = GSON.fromJson(serializedOperations, operationSetType);
    FieldLineageInfo roundTripped = new FieldLineageInfo(deserializedOperations);
    Assert.assertEquals(baseline, roundTripped);

    // The same operations supplied in a different order must still give an equal lineage info.
    operations.clear();
    operations.add(write);
    operations.add(parse);
    operations.add(read);
    FieldLineageInfo reordered = new FieldLineageInfo(operations);
    Assert.assertEquals(baseline, reordered);

    // Changing the namespace of the write endpoint from ns to myns must give a lineage info
    // that is no longer equal to the baseline.
    operations.clear();
    WriteOperation anotherWrite = new WriteOperation("write", "write data",
            EndPoint.of("myns", "endpoint2"),
            Arrays.asList(
                    InputField.of("read", "offset"),
                    InputField.of("parse", "name"),
                    InputField.of("parse", "body")));
    operations.add(anotherWrite);
    operations.add(parse);
    operations.add(read);
    FieldLineageInfo differentNamespace = new FieldLineageInfo(operations);
    Assert.assertNotEquals(baseline, differentNamespace);
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) Type(java.lang.reflect.Type) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Test(org.junit.Test)

Example 20 with WriteOperation

use of io.cdap.cdap.api.lineage.field.WriteOperation in project cdap by caskdata.

the class FieldLineageTableTest method testMergeSummaries.

/**
 * Stores field lineage for two runs of the same workflow and verifies that the incoming
 * summary of the shared destination field combines the source fields of both runs.
 */
@Test
public void testMergeSummaries() {
    // Two runs of the same workflow, generated from the values 10000 and 11000.
    RunId firstRunId = RunIds.generate(10000);
    ProgramId firstProgram = new ProgramId("default", "app1", ProgramType.WORKFLOW, "workflow1");
    final ProgramRunId programRun1 = firstProgram.run(firstRunId.getId());

    RunId secondRunId = RunIds.generate(11000);
    ProgramId secondProgram = new ProgramId("default", "app1", ProgramType.WORKFLOW, "workflow1");
    final ProgramRunId programRun2 = secondProgram.run(secondRunId.getId());

    // First lineage info: endpoint1.body is written to endpoint3.
    ReadOperation read =
            new ReadOperation("read", "some read", EndPoint.of("ns1", "endpoint1"), "offset", "body");
    WriteOperation write = new WriteOperation("write", "some write", EndPoint.of("ns", "endpoint3"),
            InputField.of("read", "body"));
    List<Operation> operations = new ArrayList<>();
    operations.add(read);
    operations.add(write);
    final FieldLineageInfo info1 = new FieldLineageInfo(operations);

    // Second lineage info: the same list is extended, so it holds the first pair of operations
    // plus a second pair writing endpoint2.body to the same destination.
    ReadOperation anotherRead = new ReadOperation("anotherRead", "another read",
            EndPoint.of("ns1", "endpoint2"), "offset", "body");
    WriteOperation anotherWrite = new WriteOperation("anotherWrite", "another write",
            EndPoint.of("ns", "endpoint3"), InputField.of("anotherRead", "body"));
    operations.add(anotherRead);
    operations.add(anotherWrite);
    final FieldLineageInfo info2 = new FieldLineageInfo(operations);

    // Store both lineage infos in one transaction.
    TransactionRunners.run(transactionRunner, context -> {
        FieldLineageTable table = FieldLineageTable.create(context);
        table.addFieldLineageInfo(programRun1, info1);
        table.addFieldLineageInfo(programRun2, info2);
    });

    // Read back in a second transaction; the queried range (0, 11001) is passed through as is.
    TransactionRunners.run(transactionRunner, context -> {
        FieldLineageTable table = FieldLineageTable.create(context);
        EndPoint firstSource = EndPoint.of("ns1", "endpoint1");
        EndPoint secondSource = EndPoint.of("ns1", "endpoint2");
        EndPoint sink = EndPoint.of("ns", "endpoint3");

        Set<EndPointField> expectedSources = new HashSet<>();
        expectedSources.add(new EndPointField(firstSource, "body"));
        expectedSources.add(new EndPointField(secondSource, "body"));

        Set<EndPointField> actualSources =
                table.getIncomingSummary(new EndPointField(sink, "body"), 0, 11001);
        Assert.assertEquals(expectedSources, actualSources);
    });
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ProgramId(io.cdap.cdap.proto.id.ProgramId) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) RunId(org.apache.twill.api.RunId) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)45 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)45 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)44 Operation (io.cdap.cdap.api.lineage.field.Operation)42 HashSet (java.util.HashSet)33 Test (org.junit.Test)33 ArrayList (java.util.ArrayList)32 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)25 HashMap (java.util.HashMap)19 LinkedHashSet (java.util.LinkedHashSet)15 FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation)14 FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation)14 FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation)14 List (java.util.List)14 ImmutableList (com.google.common.collect.ImmutableList)13 FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation)13 Connection (io.cdap.cdap.etl.proto.Connection)13 Set (java.util.Set)13 InputField (io.cdap.cdap.api.lineage.field.InputField)11 ImmutableSet (com.google.common.collect.ImmutableSet)10