
Example 26 with EndPoint

use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.

the class FieldLineageAdminTest method summary.

private Set<EndPointField> summary() {
    Set<EndPointField> endPointFields = new HashSet<>();
    EndPoint endPoint1 = EndPoint.of("ns", "file");
    EndPoint endPoint2 = EndPoint.of("ns", "anotherfile");
    endPointFields.add(new EndPointField(endPoint1, "a"));
    endPointFields.add(new EndPointField(endPoint1, "b"));
    endPointFields.add(new EndPointField(endPoint1, "c"));
    endPointFields.add(new EndPointField(endPoint2, "x"));
    endPointFields.add(new EndPointField(endPoint2, "y"));
    endPointFields.add(new EndPointField(endPoint2, "z"));
    return endPointFields;
}
Also used : EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) HashSet(java.util.HashSet)
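
As context for the helper above: the set-based assertions in these tests rely on EndPoint and EndPointField behaving as value objects. A minimal, hypothetical sketch (not part of the original test class) illustrating that assumption:

import io.cdap.cdap.api.lineage.field.EndPoint;
import io.cdap.cdap.data2.metadata.lineage.field.EndPointField;
import java.util.HashSet;
import java.util.Set;

public class EndPointEqualitySketch {
    public static void main(String[] args) {
        // Assumption: EndPoint.of(namespace, name) produces value objects, so two
        // instances built from the same arguments are equal and collapse to a
        // single entry in a HashSet. The set comparisons in the tests above
        // depend on this behavior.
        EndPoint first = EndPoint.of("ns", "file");
        EndPoint second = EndPoint.of("ns", "file");
        Set<EndPointField> fields = new HashSet<>();
        fields.add(new EndPointField(first, "a"));
        fields.add(new EndPointField(second, "a"));
        // Expected to print 1 under that assumption.
        System.out.println(fields.size());
    }
}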

Example 27 with EndPoint

use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.

the class FieldLineageAdminTest method testOperations.

@Test
public void testOperations() {
    FieldLineageAdmin fieldLineageAdmin = new FieldLineageAdmin(new FakeFieldLineageReader(Collections.emptySet(), Collections.emptySet(), operations()), metadataAdmin);
    EndPoint endPoint = EndPoint.of("ns", "file");
    // the input args to getOperationDetails below do not matter since the returned data is mocked
    FieldLineageDetails operationDetails = fieldLineageAdmin.getOperationDetails(Constants.FieldLineage.Direction.INCOMING, new EndPointField(endPoint, "somefield"), 0, Long.MAX_VALUE);
    ProgramId program1 = new ProgramId("ns", "app", ProgramType.SPARK, "sparkprogram");
    ProgramId program2 = new ProgramId("ns", "app", ProgramType.MAPREDUCE, "mrprogram");
    ProgramRunId program1Run1 = program1.run(RunIds.generate(1000));
    ProgramRunId program1Run2 = program1.run(RunIds.generate(2000));
    ProgramRunId program1Run3 = program1.run(RunIds.generate(3000));
    ProgramRunId program1Run4 = program1.run(RunIds.generate(5000));
    ProgramRunId program2Run1 = program2.run(RunIds.generate(4000));
    ProgramRunId program2Run2 = program2.run(RunIds.generate(6000));
    List<ProgramFieldOperationInfo> incomings = operationDetails.getIncoming();
    Set<ProgramFieldOperationInfo> expectedInfos = new HashSet<>();
    List<ProgramInfo> programInfos = new ArrayList<>();
    // program1Run1 and program1Run2 both generated the same set of operations, however only the latest
    // run will be included in the returned list. None of the runs of program2 generated this set of operations.
    programInfos.add(new ProgramInfo(program1, RunIds.getTime(program1Run2.getRun(), TimeUnit.SECONDS)));
    EndPoint endPoint1 = EndPoint.of("ns", "file");
    EndPoint endPoint2 = EndPoint.of("ns", "anotherfile");
    List<FieldOperationInfo> fieldOperationInfos = new ArrayList<>();
    // the returned list should contain topologically sorted operations
    fieldOperationInfos.add(new FieldOperationInfo("read", "reading file", FieldOperationInput.of(endPoint1), FieldOperationOutput.of(Arrays.asList("offset", "body"))));
    List<InputField> inputFields = new ArrayList<>();
    inputFields.add(InputField.of("read", "offset"));
    inputFields.add(InputField.of("parse", "name"));
    inputFields.add(InputField.of("parse", "address"));
    inputFields.add(InputField.of("parse", "zip"));
    fieldOperationInfos.add(new FieldOperationInfo("write", "writing file", FieldOperationInput.of(inputFields), FieldOperationOutput.of(endPoint2)));
    expectedInfos.add(new ProgramFieldOperationInfo(programInfos, fieldOperationInfos));
    programInfos = new ArrayList<>();
    // program1 and program2 both generated the next set of operations; the returned list will contain
    // only the latest run of each program, sorted by last execution time.
    programInfos.add(new ProgramInfo(program2, RunIds.getTime(program2Run2.getRun(), TimeUnit.SECONDS)));
    programInfos.add(new ProgramInfo(program1, RunIds.getTime(program1Run4.getRun(), TimeUnit.SECONDS)));
    fieldOperationInfos = new ArrayList<>();
    fieldOperationInfos.add(new FieldOperationInfo("read", "reading file", FieldOperationInput.of(endPoint1), FieldOperationOutput.of(Arrays.asList("offset", "body"))));
    FieldOperationInput input = FieldOperationInput.of(Collections.singletonList(InputField.of("read", "offset")));
    FieldOperationOutput output = FieldOperationOutput.of(Collections.singletonList("offset"));
    fieldOperationInfos.add(new FieldOperationInfo("normalize", "normalizing offset", input, output));
    inputFields = new ArrayList<>();
    inputFields.add(InputField.of("normalize", "offset"));
    inputFields.add(InputField.of("parse", "name"));
    inputFields.add(InputField.of("parse", "address"));
    inputFields.add(InputField.of("parse", "zip"));
    input = FieldOperationInput.of(inputFields);
    output = FieldOperationOutput.of(endPoint2);
    fieldOperationInfos.add(new FieldOperationInfo("write", "writing file", input, output));
    expectedInfos.add(new ProgramFieldOperationInfo(programInfos, fieldOperationInfos));
    Assert.assertNotNull(incomings);
    // converting to a set because the ordering of the returned operation infos is not guaranteed
    Assert.assertEquals(expectedInfos, new HashSet<>(incomings));
}
Also used : InputField(io.cdap.cdap.api.lineage.field.InputField) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) ProgramFieldOperationInfo(io.cdap.cdap.proto.metadata.lineage.ProgramFieldOperationInfo) ArrayList(java.util.ArrayList) FieldOperationOutput(io.cdap.cdap.proto.metadata.lineage.FieldOperationOutput) FieldOperationInput(io.cdap.cdap.proto.metadata.lineage.FieldOperationInput) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) FieldLineageDetails(io.cdap.cdap.proto.metadata.lineage.FieldLineageDetails) ProgramId(io.cdap.cdap.proto.id.ProgramId) ProgramInfo(io.cdap.cdap.proto.metadata.lineage.ProgramInfo) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) FieldOperationInfo(io.cdap.cdap.proto.metadata.lineage.FieldOperationInfo) HashSet(java.util.HashSet) Test(org.junit.Test)
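
The run/timestamp bookkeeping above can be summarized in a small sketch. The import paths for RunIds and ProgramType are assumed (they are not shown in the listing); the calls themselves mirror the test:

import io.cdap.cdap.common.app.RunIds;          // assumed import path
import io.cdap.cdap.proto.ProgramType;          // assumed import path
import io.cdap.cdap.proto.id.ProgramId;
import io.cdap.cdap.proto.id.ProgramRunId;
import java.util.concurrent.TimeUnit;

public class RunTimeSketch {
    public static void main(String[] args) {
        ProgramId program = new ProgramId("ns", "app", ProgramType.SPARK, "sparkprogram");
        // RunIds.generate(ts) embeds the start timestamp (milliseconds) in the run id,
        // so RunIds.getTime(...) can recover it later; this is how the ProgramInfo
        // entries above get their "last execution time" in seconds.
        ProgramRunId run = program.run(RunIds.generate(5000));
        System.out.println(RunIds.getTime(run.getRun(), TimeUnit.SECONDS)); // expected: 5
    }
}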

Example 28 with EndPoint

use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.

the class FieldLineageAdminTest method testFieldsWithDsSchema.

@Test
public void testFieldsWithDsSchema() throws Exception {
    FieldLineageAdmin fieldLineageAdmin = new FieldLineageAdmin(new FakeFieldLineageReader(getFieldNames(), Collections.emptySet(), Collections.emptySet()), metadataAdmin);
    EndPoint endPoint = EndPoint.of(NamespaceId.DEFAULT.getNamespace(), "file");
    // test that when there is no schema information present for the dataset and we request lineage with
    // includeCurrent set to true, we get the lineage fields correctly.
    Set<Field> expected = getFields(getFieldNames());
    // includeCurrent set to true
    Set<Field> actual = fieldLineageAdmin.getFields(endPoint, 0, Long.MAX_VALUE, null, true);
    Assert.assertEquals(expected, actual);
    // schema with some fields that are different from the ones known to the lineage store
    Schema schema = Schema.recordOf("record", Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("address", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("addiffField1", Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of("diffField2", Schema.nullableOf(Schema.of(Schema.Type.INT))));
    // add the dataset with this schema, which only partially overlaps the fields known to the lineage store
    TableProperties.Builder props = TableProperties.builder();
    TableProperties.setSchema(props, schema);
    TableProperties.setRowFieldName(props, "name");
    DatasetId datasetId = NamespaceId.DEFAULT.dataset("file");
    MetadataEntity entity = datasetId.toMetadataEntity();
    datasetFramework.addInstance("table", datasetId, props.build());
    // wait until the metadata for this dataset has been stored
    Tasks.waitFor(false, () -> metadataAdmin.getProperties(MetadataScope.SYSTEM, entity).isEmpty(), 5, TimeUnit.SECONDS);
    // test that the expected fields contain all the fields known to the lineage store but no dataset schema
    // fields, since includeCurrent is set to false
    expected = getFields(getFieldNames());
    actual = fieldLineageAdmin.getFields(endPoint, 0, Long.MAX_VALUE, null, false);
    Assert.assertEquals(expected, actual);
    // test that the expected fields contain all the fields known to the lineage store as well as the fields
    // present only in the dataset schema, since includeCurrent is set to true.
    // this also tests that fields common to the lineage store and the dataset schema (for example address)
    // have their lineage info flag set to true, since we do have lineage for them
    expected = getFields(getFieldNames());
    expected.addAll(new HashSet<>(Arrays.asList(new Field("addiffField1", false), new Field("diffField2", false))));
    actual = fieldLineageAdmin.getFields(endPoint, 0, Long.MAX_VALUE, null, true);
    Assert.assertEquals(expected, actual);
    // test fields prefixed with the string "add": when includeCurrent is not set, the dataset schema fields should not show up
    Assert.assertEquals(new HashSet<>(Arrays.asList(new Field("address", true), new Field("address_original", true))), fieldLineageAdmin.getFields(endPoint, 0, Long.MAX_VALUE, "add", false));
    // test fields prefixed with the string "add": when includeCurrent is set, the dataset schema field should also show up
    Assert.assertEquals(new HashSet<>(Arrays.asList(new Field("address", true), new Field("address_original", true), new Field("addiffField1", false))), fieldLineageAdmin.getFields(endPoint, 0, Long.MAX_VALUE, "add", true));
    // test fields prefixed with the string "ADD" (prefix matching is case insensitive)
    Assert.assertEquals(new HashSet<>(Arrays.asList(new Field("address", true), new Field("address_original", true), new Field("addiffField1", false))), fieldLineageAdmin.getFields(endPoint, 0, Long.MAX_VALUE, "ADD", true));
}
Also used : EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) DatasetField(io.cdap.cdap.proto.metadata.lineage.DatasetField) Field(io.cdap.cdap.proto.metadata.lineage.Field) InputField(io.cdap.cdap.api.lineage.field.InputField) MetadataEntity(io.cdap.cdap.api.metadata.MetadataEntity) Schema(io.cdap.cdap.api.data.schema.Schema) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) TableProperties(io.cdap.cdap.api.dataset.table.TableProperties) DatasetId(io.cdap.cdap.proto.id.DatasetId) Test(org.junit.Test)
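
The boolean passed to Field in the assertions above is the lineage flag: true means the field has recorded lineage, false means it only appears in the dataset's current schema and was included because includeCurrent was true. A small, hypothetical sketch of that convention:

import io.cdap.cdap.proto.metadata.lineage.Field;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class FieldFlagSketch {
    public static void main(String[] args) {
        Set<Field> fields = new HashSet<>(Arrays.asList(
            // true: the field is known to the lineage store
            new Field("address", true),
            // false: the field exists only in the dataset's current schema
            new Field("addiffField1", false)));
        System.out.println(fields.size()); // 2
    }
}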

Example 29 with EndPoint

use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.

the class LineageOperationProcessorTest method testSimpleJoinWithAdditionalFields.

@Test
public void testSimpleJoinWithAdditionalFields() {
    Set<Connection> connections = new HashSet<>();
    connections.add(new Connection("n1", "n3"));
    connections.add(new Connection("n2", "n3"));
    connections.add(new Connection("n3", "n4"));
    EndPoint cEndPoint = EndPoint.of("default", "customer");
    EndPoint pEndPoint = EndPoint.of("default", "purchase");
    EndPoint cpEndPoint = EndPoint.of("default", "customer_purchase");
    // customer -> (id)------------
    //                             |
    //                            JOIN  ------->(id, customer_id)
    //                             |
    // purchase -> (customer_id)---
    Map<String, List<FieldOperation>> stageOperations = new HashMap<>();
    stageOperations.put("n1", Collections.singletonList(new FieldReadOperation("ReadCustomer", "read description", cEndPoint, "id", "name")));
    stageOperations.put("n2", Collections.singletonList(new FieldReadOperation("ReadPurchase", "read description", pEndPoint, "customer_id", "item")));
    List<FieldOperation> operationsFromJoin = new ArrayList<>();
    operationsFromJoin.add(new FieldTransformOperation("Join", "Join Operation", Arrays.asList("n1.id", "n2.customer_id"), Arrays.asList("id", "customer_id")));
    operationsFromJoin.add(new FieldTransformOperation("Identity name", "Identity Operation", Collections.singletonList("n1.name"), Collections.singletonList("name")));
    operationsFromJoin.add(new FieldTransformOperation("Identity item", "Identity Operation", Collections.singletonList("n2.item"), Collections.singletonList("item")));
    stageOperations.put("n3", operationsFromJoin);
    stageOperations.put("n4", Collections.singletonList(new FieldWriteOperation("Write", "write description", cpEndPoint, "id", "name", "customer_id", "item")));
    LineageOperationsProcessor processor = new LineageOperationsProcessor(connections, stageOperations, Collections.singleton("n3"));
    Set<Operation> expectedOperations = new HashSet<>();
    expectedOperations.add(new ReadOperation("n1.ReadCustomer", "read description", cEndPoint, "id", "name"));
    expectedOperations.add(new ReadOperation("n2.ReadPurchase", "read description", pEndPoint, "customer_id", "item"));
    expectedOperations.add(new TransformOperation("n3.Join", "Join Operation", Arrays.asList(InputField.of("n1.ReadCustomer", "id"), InputField.of("n2.ReadPurchase", "customer_id")), "id", "customer_id"));
    expectedOperations.add(new TransformOperation("n3.Identity name", "Identity Operation", Collections.singletonList(InputField.of("n1.ReadCustomer", "name")), "name"));
    expectedOperations.add(new TransformOperation("n3.Identity item", "Identity Operation", Collections.singletonList(InputField.of("n2.ReadPurchase", "item")), "item"));
    expectedOperations.add(new WriteOperation("n4.Write", "write description", cpEndPoint, Arrays.asList(InputField.of("n3.Join", "id"), InputField.of("n3.Identity name", "name"), InputField.of("n3.Join", "customer_id"), InputField.of("n3.Identity item", "item"))));
    Set<Operation> processedOperations = processor.process();
    Assert.assertEquals(expectedOperations, processedOperations);
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) FieldReadOperation(io.cdap.cdap.etl.api.lineage.field.FieldReadOperation) HashMap(java.util.HashMap) Connection(io.cdap.cdap.etl.proto.Connection) ArrayList(java.util.ArrayList) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) FieldWriteOperation(io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) HashSet(java.util.HashSet) Test(org.junit.Test)
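
A pattern worth calling out in the example above: LineageOperationsProcessor qualifies every operation name with its stage name ("ReadCustomer" becomes "n1.ReadCustomer"), and InputField references use those qualified names. A minimal, hypothetical two-stage sketch of the same mechanics (the stage and dataset names are made up; the import for LineageOperationsProcessor, the class under test, is not shown in the listing and is assumed to be available):

import io.cdap.cdap.api.lineage.field.EndPoint;
import io.cdap.cdap.api.lineage.field.Operation;
import io.cdap.cdap.etl.api.lineage.field.FieldOperation;
import io.cdap.cdap.etl.api.lineage.field.FieldReadOperation;
import io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation;
import io.cdap.cdap.etl.proto.Connection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class PrefixingSketch {
    // Runs a single source -> sink pipeline through the processor.
    static Set<Operation> process() {
        Set<Connection> connections = new HashSet<>(Collections.singletonList(new Connection("src", "sink")));
        Map<String, List<FieldOperation>> stageOperations = new HashMap<>();
        EndPoint in = EndPoint.of("default", "input");
        EndPoint out = EndPoint.of("default", "output");
        stageOperations.put("src", Collections.singletonList(
            new FieldReadOperation("read", "read description", in, "body")));
        stageOperations.put("sink", Collections.singletonList(
            new FieldWriteOperation("write", "write description", out, "body")));
        // The processed set is expected to contain operations named "src.read" and
        // "sink.write", with the write's InputField pointing at "src.read".
        return new LineageOperationsProcessor(connections, stageOperations, Collections.emptySet()).process();
    }
}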

Example 30 with EndPoint

use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.

the class LineageOperationProcessorTest method testMergeOperationsNonRepeat.

@Test
public void testMergeOperationsNonRepeat() {
    // n1 -> n3 ----
    //              |---- n5
    // n2 -> n4 ----
    // operations: (n1) -> (id, name)
    //             (n2) -> (body, offset)
    //             (n3): (id) -> id, (name) -> name
    //             (n4): (body) -> (id, name)
    //             (n5) -> (id, name)
    Set<Connection> connections = new HashSet<>();
    connections.add(new Connection("n1", "n3"));
    connections.add(new Connection("n3", "n5"));
    connections.add(new Connection("n2", "n4"));
    connections.add(new Connection("n4", "n5"));
    EndPoint src1 = EndPoint.of("default", "n1");
    EndPoint src2 = EndPoint.of("default", "n2");
    EndPoint dest = EndPoint.of("default", "n5");
    Map<String, List<FieldOperation>> stageOperations = new HashMap<>();
    stageOperations.put("n1", Collections.singletonList(new FieldReadOperation("read1", "read description", src1, "id", "name")));
    stageOperations.put("n2", Collections.singletonList(new FieldReadOperation("read2", "read description", src2, "body", "offset")));
    List<FieldOperation> n3Operations = stageOperations.computeIfAbsent("n3", k -> new ArrayList<>());
    n3Operations.add(new FieldTransformOperation("identity1", "identity", Collections.singletonList("id"), "id"));
    n3Operations.add(new FieldTransformOperation("identity2", "identity", Collections.singletonList("name"), "name"));
    stageOperations.put("n4", Collections.singletonList(new FieldTransformOperation("generate", "generate", Collections.singletonList("body"), "id", "name")));
    stageOperations.put("n5", Collections.singletonList(new FieldWriteOperation("write", "write", dest, "id", "name")));
    LineageOperationsProcessor processor = new LineageOperationsProcessor(connections, stageOperations, Collections.emptySet());
    Set<Operation> expectedOperations = new HashSet<>();
    expectedOperations.add(new ReadOperation("n1.read1", "read description", src1, "id", "name"));
    expectedOperations.add(new ReadOperation("n2.read2", "read description", src2, "body", "offset"));
    expectedOperations.add(new TransformOperation("n3.identity1", "identity", Collections.singletonList(InputField.of("n1.read1", "id")), "id"));
    expectedOperations.add(new TransformOperation("n3.identity2", "identity", Collections.singletonList(InputField.of("n1.read1", "name")), "name"));
    expectedOperations.add(new TransformOperation("n4.generate", "generate", Collections.singletonList(InputField.of("n2.read2", "body")), "id", "name"));
    expectedOperations.add(new TransformOperation("n3,n4.merge.id", "Merged stages: n3,n4", Arrays.asList(InputField.of("n3.identity1", "id"), InputField.of("n4.generate", "id")), "id"));
    expectedOperations.add(new TransformOperation("n3,n4.merge.name", "Merged stages: n3,n4", Arrays.asList(InputField.of("n3.identity2", "name"), InputField.of("n4.generate", "name")), "name"));
    expectedOperations.add(new TransformOperation("n3,n4.merge.body", "Merged stages: n3,n4", Collections.singletonList(InputField.of("n2.read2", "body")), "body"));
    expectedOperations.add(new TransformOperation("n3,n4.merge.offset", "Merged stages: n3,n4", Collections.singletonList(InputField.of("n2.read2", "offset")), "offset"));
    expectedOperations.add(new WriteOperation("n5.write", "write", dest, Arrays.asList(InputField.of("n3,n4.merge.id", "id"), InputField.of("n3,n4.merge.name", "name"))));
    Set<Operation> process = processor.process();
    Assert.assertEquals(expectedOperations, process);
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) FieldReadOperation(io.cdap.cdap.etl.api.lineage.field.FieldReadOperation) HashMap(java.util.HashMap) Connection(io.cdap.cdap.etl.proto.Connection) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) FieldTransformOperation(io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) FieldWriteOperation(io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) HashSet(java.util.HashSet) Test(org.junit.Test)
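
The synthetic "n3,n4.merge.<field>" transforms in the expected set are how the processor reconciles branches that converge on one stage: each field flowing into n5 gets a single merged origin. A hedged sketch (assuming Operation exposes getName(), as the api-level operation classes constructed above do) of picking those merge operations out of a processed set:

import io.cdap.cdap.api.lineage.field.Operation;
import java.util.Set;
import java.util.TreeSet;

public class MergeNameSketch {
    // Collects the names of the synthetic merge transforms for the given
    // converged stages, e.g. "n3,n4.merge.id", "n3,n4.merge.name", ...
    static Set<String> mergeOperationNames(Set<Operation> processed, String mergedStages) {
        Set<String> names = new TreeSet<>();
        String prefix = mergedStages + ".merge.";
        for (Operation op : processed) {
            if (op.getName().startsWith(prefix)) {
                names.add(op.getName());
            }
        }
        return names;
    }
}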

Aggregations

EndPoint (io.cdap.cdap.api.lineage.field.EndPoint) 33
HashSet (java.util.HashSet) 28
Test (org.junit.Test) 26
ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation) 24
TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation) 24
WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation) 24
Operation (io.cdap.cdap.api.lineage.field.Operation) 23
ArrayList (java.util.ArrayList) 19
HashMap (java.util.HashMap) 14
List (java.util.List) 11
ImmutableList (com.google.common.collect.ImmutableList) 10
FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation) 10
FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation) 10
FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation) 10
FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation) 10
Connection (io.cdap.cdap.etl.proto.Connection) 10
ImmutableSet (com.google.common.collect.ImmutableSet) 9
Set (java.util.Set) 9
EndPointField (io.cdap.cdap.data2.metadata.lineage.field.EndPointField) 8
LinkedHashSet (java.util.LinkedHashSet) 7