Use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.
The class FieldLineageAdminTest, method summary.
private Set<EndPointField> summary() {
  Set<EndPointField> endPointFields = new HashSet<>();
  EndPoint endPoint1 = EndPoint.of("ns", "file");
  EndPoint endPoint2 = EndPoint.of("ns", "anotherfile");
  endPointFields.add(new EndPointField(endPoint1, "a"));
  endPointFields.add(new EndPointField(endPoint1, "b"));
  endPointFields.add(new EndPointField(endPoint1, "c"));
  endPointFields.add(new EndPointField(endPoint2, "x"));
  endPointFields.add(new EndPointField(endPoint2, "y"));
  endPointFields.add(new EndPointField(endPoint2, "z"));
  return endPointFields;
}
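The set built above pairs each field with its endpoint and serves as the mocked lineage summary. A minimal sketch of consuming such a summary by grouping fields per endpoint; the grouping code is illustrative only and assumes the EndPointField accessors getEndPoint() and getField():

// Illustrative only: group the mocked summary fields by their endpoint.
Map<EndPoint, Set<String>> fieldsByEndPoint = new HashMap<>();
for (EndPointField endPointField : summary()) {
  fieldsByEndPoint
      .computeIfAbsent(endPointField.getEndPoint(), k -> new HashSet<>())
      .add(endPointField.getField());
}
// Expected grouping: ns.file -> {a, b, c}, ns.anotherfile -> {x, y, z}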
Use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.
The class FieldLineageAdminTest, method testOperations.
@Test
public void testOperations() {
  FieldLineageAdmin fieldLineageAdmin = new FieldLineageAdmin(
      new FakeFieldLineageReader(Collections.emptySet(), Collections.emptySet(), operations()), metadataAdmin);
  EndPoint endPoint = EndPoint.of("ns", "file");
  // The input arguments to getOperationDetails below do not matter, since the returned data is mocked.
  FieldLineageDetails operationDetails =
      fieldLineageAdmin.getOperationDetails(Constants.FieldLineage.Direction.INCOMING,
                                            new EndPointField(endPoint, "somefield"), 0, Long.MAX_VALUE);
  ProgramId program1 = new ProgramId("ns", "app", ProgramType.SPARK, "sparkprogram");
  ProgramId program2 = new ProgramId("ns", "app", ProgramType.MAPREDUCE, "mrprogram");
  ProgramRunId program1Run1 = program1.run(RunIds.generate(1000));
  ProgramRunId program1Run2 = program1.run(RunIds.generate(2000));
  ProgramRunId program1Run3 = program1.run(RunIds.generate(3000));
  ProgramRunId program1Run4 = program1.run(RunIds.generate(5000));
  ProgramRunId program2Run1 = program2.run(RunIds.generate(4000));
  ProgramRunId program2Run2 = program2.run(RunIds.generate(6000));
  List<ProgramFieldOperationInfo> incomings = operationDetails.getIncoming();
  Set<ProgramFieldOperationInfo> expectedInfos = new HashSet<>();
  List<ProgramInfo> programInfos = new ArrayList<>();
  // program1Run1 and program1Run2 both generated the same set of operations; however, only the latest
  // run is included in the returned list. No run of program2 generated this set of operations.
  programInfos.add(new ProgramInfo(program1, RunIds.getTime(program1Run2.getRun(), TimeUnit.SECONDS)));
  EndPoint endPoint1 = EndPoint.of("ns", "file");
  EndPoint endPoint2 = EndPoint.of("ns", "anotherfile");
  List<FieldOperationInfo> fieldOperationInfos = new ArrayList<>();
  // The returned list should contain topologically sorted operations.
  fieldOperationInfos.add(new FieldOperationInfo("read", "reading file", FieldOperationInput.of(endPoint1),
                                                 FieldOperationOutput.of(Arrays.asList("offset", "body"))));
  List<InputField> inputFields = new ArrayList<>();
  inputFields.add(InputField.of("read", "offset"));
  inputFields.add(InputField.of("parse", "name"));
  inputFields.add(InputField.of("parse", "address"));
  inputFields.add(InputField.of("parse", "zip"));
  fieldOperationInfos.add(new FieldOperationInfo("write", "writing file", FieldOperationInput.of(inputFields),
                                                 FieldOperationOutput.of(endPoint2)));
  expectedInfos.add(new ProgramFieldOperationInfo(programInfos, fieldOperationInfos));
  programInfos = new ArrayList<>();
  // program1 and program2 both generated the next set of operations; the returned list contains
  // only the latest run of each program, sorted by last execution time.
  programInfos.add(new ProgramInfo(program2, RunIds.getTime(program2Run2.getRun(), TimeUnit.SECONDS)));
  programInfos.add(new ProgramInfo(program1, RunIds.getTime(program1Run4.getRun(), TimeUnit.SECONDS)));
  fieldOperationInfos = new ArrayList<>();
  fieldOperationInfos.add(new FieldOperationInfo("read", "reading file", FieldOperationInput.of(endPoint1),
                                                 FieldOperationOutput.of(Arrays.asList("offset", "body"))));
  FieldOperationInput input = FieldOperationInput.of(Collections.singletonList(InputField.of("read", "offset")));
  FieldOperationOutput output = FieldOperationOutput.of(Collections.singletonList("offset"));
  fieldOperationInfos.add(new FieldOperationInfo("normalize", "normalizing offset", input, output));
  inputFields = new ArrayList<>();
  inputFields.add(InputField.of("normalize", "offset"));
  inputFields.add(InputField.of("parse", "name"));
  inputFields.add(InputField.of("parse", "address"));
  inputFields.add(InputField.of("parse", "zip"));
  input = FieldOperationInput.of(inputFields);
  output = FieldOperationOutput.of(endPoint2);
  fieldOperationInfos.add(new FieldOperationInfo("write", "writing file", input, output));
  expectedInfos.add(new ProgramFieldOperationInfo(programInfos, fieldOperationInfos));
  Assert.assertNotNull(incomings);
  // Convert to a set because the ordering of different versions of operations is not guaranteed.
  Assert.assertEquals(expectedInfos, new HashSet<>(incomings));
}
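A note on the run-id timestamps used above: the test pairs RunIds.generate with RunIds.getTime to recover each run's creation time. A minimal sketch of that round trip, using only the calls that appear in the test and assuming, as the pairing with TimeUnit.SECONDS suggests, that RunIds.generate takes a timestamp in milliseconds:

ProgramId program = new ProgramId("ns", "app", ProgramType.SPARK, "sparkprogram");
// Generate a run id carrying a 5000 ms creation timestamp.
ProgramRunId run = program.run(RunIds.generate(5000));
// Extract the timestamp back in seconds; expected value: 5.
long startSeconds = RunIds.getTime(run.getRun(), TimeUnit.SECONDS);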
Use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.
The class FieldLineageAdminTest, method testFieldsWithDsSchema.
@Test
public void testFieldsWithDsSchema() throws Exception {
  FieldLineageAdmin fieldLineageAdmin = new FieldLineageAdmin(
      new FakeFieldLineageReader(getFieldNames(), Collections.emptySet(), Collections.emptySet()), metadataAdmin);
  EndPoint endPoint = EndPoint.of(NamespaceId.DEFAULT.getNamespace(), "file");
  // Test that when there is no schema information present for the dataset and we request lineage
  // with includeCurrent set to true, we get the lineage fields correctly.
  Set<Field> expected = getFields(getFieldNames());
  // includeCurrent set to true
  Set<Field> actual = fieldLineageAdmin.getFields(endPoint, 0, Long.MAX_VALUE, null, true);
  Assert.assertEquals(expected, actual);
  // A schema whose fields differ from those known to the lineage store.
  Schema schema = Schema.recordOf(
      "record",
      Schema.Field.of("name", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
      Schema.Field.of("address", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
      Schema.Field.of("addiffField1", Schema.nullableOf(Schema.of(Schema.Type.STRING))),
      Schema.Field.of("diffField2", Schema.nullableOf(Schema.of(Schema.Type.INT))));
  // Add the dataset with a schema whose fields partially overlap those known to the lineage store.
  TableProperties.Builder props = TableProperties.builder();
  TableProperties.setSchema(props, schema);
  TableProperties.setRowFieldName(props, "name");
  DatasetId datasetId = NamespaceId.DEFAULT.dataset("file");
  MetadataEntity entity = datasetId.toMetadataEntity();
  datasetFramework.addInstance("table", datasetId, props.build());
  // Wait until the metadata for this dataset has been stored.
  Tasks.waitFor(false, () -> metadataAdmin.getProperties(MetadataScope.SYSTEM, entity).isEmpty(),
                5, TimeUnit.SECONDS);
  // The expected result should contain all the fields known to the lineage store, but no dataset
  // schema fields, since includeCurrent is set to false.
  expected = getFields(getFieldNames());
  actual = fieldLineageAdmin.getFields(endPoint, 0, Long.MAX_VALUE, null, false);
  Assert.assertEquals(expected, actual);
  // The expected result should contain all the fields known to the lineage store and also the fields
  // present only in the dataset schema, since includeCurrent is set to true.
  // This also tests that fields common to the lineage store and the dataset schema (for example,
  // "address") have their lineage info flag set to true, since we do have lineage for them.
  expected = getFields(getFieldNames());
  expected.addAll(new HashSet<>(Arrays.asList(new Field("addiffField1", false), new Field("diffField2", false))));
  actual = fieldLineageAdmin.getFields(endPoint, 0, Long.MAX_VALUE, null, true);
  Assert.assertEquals(expected, actual);
  // Fields prefixed with "add": when includeCurrent is not set, the dataset schema field should not show up.
  Assert.assertEquals(new HashSet<>(Arrays.asList(new Field("address", true), new Field("address_original", true))),
                      fieldLineageAdmin.getFields(endPoint, 0, Long.MAX_VALUE, "add", false));
  // Fields prefixed with "add": when includeCurrent is set, the dataset schema field should also show up.
  Assert.assertEquals(new HashSet<>(Arrays.asList(new Field("address", true), new Field("address_original", true),
                                                  new Field("addiffField1", false))),
                      fieldLineageAdmin.getFields(endPoint, 0, Long.MAX_VALUE, "add", true));
  // Fields prefixed with "ADD": the prefix match is case-insensitive.
  Assert.assertEquals(new HashSet<>(Arrays.asList(new Field("address", true), new Field("address_original", true),
                                                  new Field("addiffField1", false))),
                      fieldLineageAdmin.getFields(endPoint, 0, Long.MAX_VALUE, "ADD", true));
}
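The helpers getFieldNames() and getFields(...) are defined elsewhere in FieldLineageAdminTest. A plausible sketch of their shape, consistent with the assertions above; the exact field names are assumptions, not the test's actual values:

// Hypothetical reconstructions, not the test's actual helpers.
private Set<String> getFieldNames() {
  // Assumed to include at least the lineage-known fields asserted above.
  return new HashSet<>(Arrays.asList("name", "address", "address_original"));
}

private Set<Field> getFields(Set<String> fieldNames) {
  Set<Field> fields = new HashSet<>();
  for (String name : fieldNames) {
    // Fields known to the lineage store carry lineage info, hence the true flag.
    fields.add(new Field(name, true));
  }
  return fields;
}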
Use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.
The class LineageOperationProcessorTest, method testSimpleJoinWithAdditionalFields.
@Test
public void testSimpleJoinWithAdditionalFields() {
  Set<Connection> connections = new HashSet<>();
  connections.add(new Connection("n1", "n3"));
  connections.add(new Connection("n2", "n3"));
  connections.add(new Connection("n3", "n4"));
  EndPoint cEndPoint = EndPoint.of("default", "customer");
  EndPoint pEndPoint = EndPoint.of("default", "purchase");
  EndPoint cpEndPoint = EndPoint.of("default", "customer_purchase");
  // customer -> (id)------------
  //                            |
  //                          JOIN ------->(id, customer_id)
  //                            |
  // purchase -> (customer_id)---
  Map<String, List<FieldOperation>> stageOperations = new HashMap<>();
  stageOperations.put("n1", Collections.singletonList(
      new FieldReadOperation("ReadCustomer", "read description", cEndPoint, "id", "name")));
  stageOperations.put("n2", Collections.singletonList(
      new FieldReadOperation("ReadPurchase", "read description", pEndPoint, "customer_id", "item")));
  List<FieldOperation> operationsFromJoin = new ArrayList<>();
  operationsFromJoin.add(new FieldTransformOperation("Join", "Join Operation",
      Arrays.asList("n1.id", "n2.customer_id"), Arrays.asList("id", "customer_id")));
  operationsFromJoin.add(new FieldTransformOperation("Identity name", "Identity Operation",
      Collections.singletonList("n1.name"), Collections.singletonList("name")));
  operationsFromJoin.add(new FieldTransformOperation("Identity item", "Identity Operation",
      Collections.singletonList("n2.item"), Collections.singletonList("item")));
  stageOperations.put("n3", operationsFromJoin);
  stageOperations.put("n4", Collections.singletonList(
      new FieldWriteOperation("Write", "write description", cpEndPoint, "id", "name", "customer_id", "item")));
  LineageOperationsProcessor processor =
      new LineageOperationsProcessor(connections, stageOperations, Collections.singleton("n3"));
  Set<Operation> expectedOperations = new HashSet<>();
  expectedOperations.add(new ReadOperation("n1.ReadCustomer", "read description", cEndPoint, "id", "name"));
  expectedOperations.add(new ReadOperation("n2.ReadPurchase", "read description", pEndPoint, "customer_id", "item"));
  expectedOperations.add(new TransformOperation("n3.Join", "Join Operation",
      Arrays.asList(InputField.of("n1.ReadCustomer", "id"), InputField.of("n2.ReadPurchase", "customer_id")),
      "id", "customer_id"));
  expectedOperations.add(new TransformOperation("n3.Identity name", "Identity Operation",
      Collections.singletonList(InputField.of("n1.ReadCustomer", "name")), "name"));
  expectedOperations.add(new TransformOperation("n3.Identity item", "Identity Operation",
      Collections.singletonList(InputField.of("n2.ReadPurchase", "item")), "item"));
  expectedOperations.add(new WriteOperation("n4.Write", "write description", cpEndPoint,
      Arrays.asList(InputField.of("n3.Join", "id"), InputField.of("n3.Identity name", "name"),
                    InputField.of("n3.Join", "customer_id"), InputField.of("n3.Identity item", "item"))));
  Set<Operation> processedOperations = processor.process();
  Assert.assertEquals(expectedOperations, processedOperations);
}
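As the expectations above show, the processor prefixes each operation name with its stage ("Join" declared in stage n3 becomes "n3.Join") and resolves stage-qualified inputs such as "n1.id" to the operation that produced the field in that stage. A minimal illustration of that resolution, reusing the processor built above:

// "n1.id" declared by the Join resolves to the read operation of stage n1, "n1.ReadCustomer".
TransformOperation join = new TransformOperation("n3.Join", "Join Operation",
    Arrays.asList(InputField.of("n1.ReadCustomer", "id"), InputField.of("n2.ReadPurchase", "customer_id")),
    "id", "customer_id");
Assert.assertTrue(processor.process().contains(join));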
Use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.
The class LineageOperationProcessorTest, method testMergeOperationsNonRepeat.
@Test
public void testMergeOperationsNonRepeat() {
  // n1 -> n3 ----
  //             |---- n5
  // n2 -> n4 ----
  // operations (n1) -> (id, name)
  //            (n2) -> (body, offset)
  //            (n3.id) -> id
  //            (n3.name) -> name
  //            (n4.body) -> (id, name)
  //            (n5) -> (id, name)
  Set<Connection> connections = new HashSet<>();
  connections.add(new Connection("n1", "n3"));
  connections.add(new Connection("n3", "n5"));
  connections.add(new Connection("n2", "n4"));
  connections.add(new Connection("n4", "n5"));
  EndPoint src1 = EndPoint.of("default", "n1");
  EndPoint src2 = EndPoint.of("default", "n2");
  EndPoint dest = EndPoint.of("default", "n5");
  Map<String, List<FieldOperation>> stageOperations = new HashMap<>();
  stageOperations.put("n1", Collections.singletonList(
      new FieldReadOperation("read1", "read description", src1, "id", "name")));
  stageOperations.put("n2", Collections.singletonList(
      new FieldReadOperation("read2", "read description", src2, "body", "offset")));
  List<FieldOperation> n3Operations = stageOperations.computeIfAbsent("n3", k -> new ArrayList<>());
  n3Operations.add(new FieldTransformOperation("identity1", "identity", Collections.singletonList("id"), "id"));
  n3Operations.add(new FieldTransformOperation("identity2", "identity", Collections.singletonList("name"), "name"));
  stageOperations.put("n4", Collections.singletonList(
      new FieldTransformOperation("generate", "generate", Collections.singletonList("body"), "id", "name")));
  stageOperations.put("n5", Collections.singletonList(
      new FieldWriteOperation("write", "write", dest, "id", "name")));
  LineageOperationsProcessor processor =
      new LineageOperationsProcessor(connections, stageOperations, Collections.emptySet());
  Set<Operation> expectedOperations = new HashSet<>();
  expectedOperations.add(new ReadOperation("n1.read1", "read description", src1, "id", "name"));
  expectedOperations.add(new ReadOperation("n2.read2", "read description", src2, "body", "offset"));
  expectedOperations.add(new TransformOperation("n3.identity1", "identity",
      Collections.singletonList(InputField.of("n1.read1", "id")), "id"));
  expectedOperations.add(new TransformOperation("n3.identity2", "identity",
      Collections.singletonList(InputField.of("n1.read1", "name")), "name"));
  expectedOperations.add(new TransformOperation("n4.generate", "generate",
      Collections.singletonList(InputField.of("n2.read2", "body")), "id", "name"));
  expectedOperations.add(new TransformOperation("n3,n4.merge.id", "Merged stages: n3,n4",
      Arrays.asList(InputField.of("n3.identity1", "id"), InputField.of("n4.generate", "id")), "id"));
  expectedOperations.add(new TransformOperation("n3,n4.merge.name", "Merged stages: n3,n4",
      Arrays.asList(InputField.of("n3.identity2", "name"), InputField.of("n4.generate", "name")), "name"));
  expectedOperations.add(new TransformOperation("n3,n4.merge.body", "Merged stages: n3,n4",
      Collections.singletonList(InputField.of("n2.read2", "body")), "body"));
  expectedOperations.add(new TransformOperation("n3,n4.merge.offset", "Merged stages: n3,n4",
      Collections.singletonList(InputField.of("n2.read2", "offset")), "offset"));
  expectedOperations.add(new WriteOperation("n5.write", "write", dest,
      Arrays.asList(InputField.of("n3,n4.merge.id", "id"), InputField.of("n3,n4.merge.name", "name"))));
  Set<Operation> process = processor.process();
  Assert.assertEquals(expectedOperations, process);
}
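A note on the expectations above: when parallel branches (n3 and n4 here) converge on a stage, the processor emits one synthetic merge operation per field, named by joining the merged stage names ("n3,n4.merge.id"). The downstream write then reads from the merge operations rather than from either branch directly, as this small check against the processor built above illustrates:

// The write's inputs reference the synthetic merge operations, not n3 or n4 directly.
Assert.assertTrue(processor.process().contains(
    new WriteOperation("n5.write", "write", dest,
        Arrays.asList(InputField.of("n3,n4.merge.id", "id"), InputField.of("n3,n4.merge.name", "name")))));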