use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.
the class LineageOperationProcessorTest method testSourceWithMultipleDestinations.
/**
 * Verifies that field operations are processed correctly when a single parsed source
 * fans out to two separate destinations.
 *
 * <p>The read stage (n1) feeds the parse stage (n2), and n2 feeds both write stages
 * (n3 and n4). Each write is expected to take its input fields from {@code n2.parse}.
 */
@Test
public void testSourceWithMultipleDestinations() {
  //              |----->n3
  // n1--->n2-----|
  //              |----->n4

  // n1 => read: file -> (offset, body)
  // n2 => parse: body -> (id, name, address, zip)
  // n3 => write1: (parse.id, parse.name) -> info
  // n4 => write2: (parse.address, parse.zip) -> location

  Set<Connection> connections = new HashSet<>();
  connections.add(new Connection("n1", "n2"));
  connections.add(new Connection("n2", "n3"));
  // n4 is a second destination of n2 (see diagram above), not a child of the n3 sink.
  connections.add(new Connection("n2", "n4"));

  EndPoint source = EndPoint.of("ns", "file");
  EndPoint info = EndPoint.of("ns", "info");
  EndPoint location = EndPoint.of("ns", "location");

  // Field operations recorded by each stage, keyed by stage name.
  Map<String, List<FieldOperation>> stageOperations = new HashMap<>();

  List<FieldOperation> fieldOperations = new ArrayList<>();
  fieldOperations.add(new FieldReadOperation("read", "reading from file", source, "offset", "body"));
  stageOperations.put("n1", fieldOperations);

  fieldOperations = new ArrayList<>();
  fieldOperations.add(new FieldTransformOperation("parse", "parsing body", Collections.singletonList("body"),
                                                  "id", "name", "address", "zip"));
  stageOperations.put("n2", fieldOperations);

  fieldOperations = new ArrayList<>();
  fieldOperations.add(new FieldWriteOperation("infoWrite", "writing info", info, "id", "name"));
  stageOperations.put("n3", fieldOperations);

  fieldOperations = new ArrayList<>();
  fieldOperations.add(new FieldWriteOperation("locationWrite", "writing location", location, "address", "zip"));
  stageOperations.put("n4", fieldOperations);

  LineageOperationsProcessor processor =
    new LineageOperationsProcessor(connections, stageOperations, Collections.emptySet());
  Set<Operation> processedOperations = processor.process();

  // Expected operation names are prefixed with the stage name, and every input field
  // points at the operation that produced it.
  Set<Operation> expectedOperations = new HashSet<>();

  ReadOperation read = new ReadOperation("n1.read", "reading from file", source, "offset", "body");
  expectedOperations.add(read);

  TransformOperation parse = new TransformOperation("n2.parse", "parsing body",
                                                    Collections.singletonList(InputField.of("n1.read", "body")),
                                                    "id", "name", "address", "zip");
  expectedOperations.add(parse);

  WriteOperation infoWrite = new WriteOperation("n3.infoWrite", "writing info", info,
                                                InputField.of("n2.parse", "id"),
                                                InputField.of("n2.parse", "name"));
  expectedOperations.add(infoWrite);

  WriteOperation locationWrite = new WriteOperation("n4.locationWrite", "writing location", location,
                                                    InputField.of("n2.parse", "address"),
                                                    InputField.of("n2.parse", "zip"));
  expectedOperations.add(locationWrite);

  Assert.assertEquals(new FieldLineageInfo(expectedOperations), new FieldLineageInfo(processedOperations));
}
use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.
the class FieldLineageAdminTest method testFields.
/**
 * Checks the fields returned for an endpoint, first without a prefix filter and then
 * restricted to the prefix {@code "add"}. The reader is a fake seeded with
 * {@code getFieldNames()}.
 */
@Test
public void testFields() throws Exception {
  FakeFieldLineageReader fakeReader =
    new FakeFieldLineageReader(getFieldNames(), Collections.emptySet(), Collections.emptySet());
  FieldLineageAdmin admin = new FieldLineageAdmin(fakeReader, metadataAdmin);
  EndPoint fileEndPoint = EndPoint.of("ns", "file");

  // No prefix: every field name known to the fake reader should come back.
  Assert.assertEquals(getFields(getFieldNames()),
                      admin.getFields(fileEndPoint, 0, Long.MAX_VALUE, null, false));

  // Prefix "add": only the two address fields are expected.
  Set<Field> addressFields = new HashSet<>(Arrays.asList(new Field("address", true),
                                                         new Field("address_original", true)));
  Assert.assertEquals(addressFields,
                      admin.getFields(fileEndPoint, 0, Long.MAX_VALUE, "add", false));
}
use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.
the class FieldLineageAdminTest method testDatasetFieldLineageSummary.
/**
 * Checks the dataset-level field lineage summary in direction {@code BOTH}: the echoed
 * query parameters, the dataset's own fields, and the incoming/outgoing field relations
 * grouped per related dataset. All lineage data comes from a fake reader.
 */
@Test
public void testDatasetFieldLineageSummary() throws Exception {
  // Fields of the dataset being queried.
  Set<String> datasetFields = ImmutableSet.of("field1", "field2", "field3");

  EndPoint src1 = EndPoint.of("ns1", "src1");
  EndPoint src2 = EndPoint.of("ns1", "src2");
  EndPoint src3 = EndPoint.of("ns1", "src3");
  EndPoint dest1 = EndPoint.of("ns1", "dest1");
  EndPoint dest2 = EndPoint.of("ns1", "dest2");

  // Incoming relations (source field -> dataset field):
  //   src1: src1f1 -> field1
  //   src1: src1f2 -> field1
  //   src2: src2f1 -> field1
  //   src2: src2f2 -> field2
  //   src3: src3f1 -> field2
  //   src3: src3f2 -> field3
  Set<EndPointField> field1Sources = ImmutableSet.of(new EndPointField(src1, "src1f1"),
                                                     new EndPointField(src1, "src1f2"),
                                                     new EndPointField(src2, "src2f1"));
  Set<EndPointField> field2Sources = ImmutableSet.of(new EndPointField(src2, "src2f2"),
                                                     new EndPointField(src3, "src3f1"));
  Set<EndPointField> field3Sources = ImmutableSet.of(new EndPointField(src3, "src3f2"));
  Map<String, Set<EndPointField>> incomings = ImmutableMap.of("field1", field1Sources,
                                                              "field2", field2Sources,
                                                              "field3", field3Sources);

  // Outgoing relations (dataset field -> destination field):
  //   dest1: field1 -> dest1f1
  //   dest2: field1 -> dest2f1
  //   dest1: field2 -> dest1f2
  //   dest2: field2 -> dest2f1
  //   dest2: field3 -> dest2f2
  Set<EndPointField> field1Targets = ImmutableSet.of(new EndPointField(dest1, "dest1f1"),
                                                     new EndPointField(dest2, "dest2f1"));
  Set<EndPointField> field2Targets = ImmutableSet.of(new EndPointField(dest1, "dest1f2"),
                                                     new EndPointField(dest2, "dest2f1"));
  Set<EndPointField> field3Targets = ImmutableSet.of(new EndPointField(dest2, "dest2f2"));
  Map<String, Set<EndPointField>> outgoings = ImmutableMap.of("field1", field1Targets,
                                                              "field2", field2Targets,
                                                              "field3", field3Targets);

  // Complete field set of every related endpoint, including fields that take part in no relation.
  ImmutableMap<EndPoint, Set<String>> endPointFields = ImmutableMap.<EndPoint, Set<String>>builder()
    .put(src1, ImmutableSet.of("src1f1", "src1f2", "src1f3"))
    .put(src2, ImmutableSet.of("src2f1", "src2f2"))
    .put(src3, ImmutableSet.of("src3f1", "src3f2"))
    .put(dest1, ImmutableSet.of("dest1f1", "dest1f2", "dest1f3", "dest1f4"))
    .put(dest2, ImmutableSet.of("dest2f1", "dest2f2"))
    .build();

  FakeFieldLineageReader fakeReader = new FakeFieldLineageReader(datasetFields, Collections.emptySet(),
                                                                 incomings, outgoings,
                                                                 Collections.emptySet(), endPointFields);
  FieldLineageAdmin admin = new FieldLineageAdmin(fakeReader, metadataAdmin);

  // The queried dataset name is irrelevant to the returned relations because the reader is mocked.
  EndPoint queried = EndPoint.of("ns1", "ds1");
  DatasetFieldLineageSummary summary =
    admin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, queried, 0L, Long.MAX_VALUE);

  // The summary echoes the query parameters and the dataset's fields.
  Assert.assertEquals(Constants.FieldLineage.Direction.BOTH, summary.getDirection());
  Assert.assertEquals(0L, summary.getStartTs());
  Assert.assertEquals(Long.MAX_VALUE, summary.getEndTs());
  Assert.assertEquals(datasetFields, summary.getFields());
  Assert.assertEquals(new DatasetId("ns1", "ds1"), summary.getDatasetId());

  // The integer passed to FieldLineageRelations equals the size of that endpoint's field set
  // registered above (presumably the endpoint's total field count).
  Set<DatasetFieldLineageSummary.FieldLineageRelations> expectedIncomings = ImmutableSet.of(
    new DatasetFieldLineageSummary.FieldLineageRelations(
      new DatasetId("ns1", "src1"), 3,
      ImmutableSet.of(new FieldRelation("src1f1", "field1"), new FieldRelation("src1f2", "field1"))),
    new DatasetFieldLineageSummary.FieldLineageRelations(
      new DatasetId("ns1", "src2"), 2,
      ImmutableSet.of(new FieldRelation("src2f1", "field1"), new FieldRelation("src2f2", "field2"))),
    new DatasetFieldLineageSummary.FieldLineageRelations(
      new DatasetId("ns1", "src3"), 2,
      ImmutableSet.of(new FieldRelation("src3f1", "field2"), new FieldRelation("src3f2", "field3"))));
  Assert.assertEquals(expectedIncomings, summary.getIncoming());

  Set<DatasetFieldLineageSummary.FieldLineageRelations> expectedOutgoings = ImmutableSet.of(
    new DatasetFieldLineageSummary.FieldLineageRelations(
      new DatasetId("ns1", "dest1"), 4,
      ImmutableSet.of(new FieldRelation("field1", "dest1f1"), new FieldRelation("field2", "dest1f2"))),
    new DatasetFieldLineageSummary.FieldLineageRelations(
      new DatasetId("ns1", "dest2"), 2,
      ImmutableSet.of(new FieldRelation("field1", "dest2f1"), new FieldRelation("field2", "dest2f1"),
                      new FieldRelation("field3", "dest2f2"))));
  Assert.assertEquals(expectedOutgoings, summary.getOutgoing());
}
use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.
the class FieldLineageAdminTest method testSummary.
/**
 * Checks that the field lineage summary only populates the side(s) matching the requested
 * direction: incoming only, outgoing only, or both. The reader is a fake seeded with
 * {@code summary()}.
 */
@Test
public void testSummary() {
  FakeFieldLineageReader fakeReader =
    new FakeFieldLineageReader(Collections.emptySet(), summary(), Collections.emptySet());
  FieldLineageAdmin admin = new FieldLineageAdmin(fakeReader, metadataAdmin);

  Set<DatasetField> expected = new HashSet<>(Arrays.asList(
    new DatasetField(new DatasetId("ns", "file"), new HashSet<>(Arrays.asList("a", "b", "c"))),
    new DatasetField(new DatasetId("ns", "anotherfile"), new HashSet<>(Arrays.asList("x", "y", "z")))));

  // The query arguments are irrelevant to the result because the reader returns mocked data.
  EndPointField queriedField = new EndPointField(EndPoint.of("ns", "file"), "somefield");

  // INCOMING: only the incoming side is populated.
  FieldLineageSummary lineage =
    admin.getFieldLineage(Constants.FieldLineage.Direction.INCOMING, queriedField, 0, Long.MAX_VALUE);
  Assert.assertEquals(expected, lineage.getIncoming());
  Assert.assertNull(lineage.getOutgoing());

  // OUTGOING: only the outgoing side is populated.
  lineage = admin.getFieldLineage(Constants.FieldLineage.Direction.OUTGOING, queriedField, 0, Long.MAX_VALUE);
  Assert.assertEquals(expected, lineage.getOutgoing());
  Assert.assertNull(lineage.getIncoming());

  // BOTH: both sides are populated.
  lineage = admin.getFieldLineage(Constants.FieldLineage.Direction.BOTH, queriedField, 0, Long.MAX_VALUE);
  Assert.assertEquals(expected, lineage.getOutgoing());
  Assert.assertEquals(expected, lineage.getIncoming());
}
use of io.cdap.cdap.api.lineage.field.EndPoint in project cdap by caskdata.
the class FieldLineageAdminTest method operations.
/**
 * Builds the program-run/operation fixture used by the tests.
 *
 * <p>Two groups are returned: the first two runs of the Spark program executed
 * {@code read -> write}; the remaining Spark runs and both MapReduce runs executed
 * {@code read -> normalize -> write}, where the write takes {@code offset} from
 * {@code normalize} instead of directly from {@code read}.
 *
 * @return the two {@link ProgramRunOperations} groups described above
 */
private Set<ProgramRunOperations> operations() {
  ProgramId sparkProgram = new ProgramId("ns", "app", ProgramType.SPARK, "sparkprogram");
  ProgramId mrProgram = new ProgramId("ns", "app", ProgramType.MAPREDUCE, "mrprogram");

  EndPoint fileEndPoint = EndPoint.of("ns", "file");
  EndPoint anotherFileEndPoint = EndPoint.of("ns", "anotherfile");

  ReadOperation read = new ReadOperation("read", "reading file", fileEndPoint, "offset", "body");

  // First group: the write consumes "offset" straight from the read.
  WriteOperation directWrite = new WriteOperation("write", "writing file", anotherFileEndPoint,
                                                  InputField.of("read", "offset"),
                                                  InputField.of("parse", "name"),
                                                  InputField.of("parse", "address"),
                                                  InputField.of("parse", "zip"));
  ProgramRunId sparkRun1 = sparkProgram.run(RunIds.generate(1000));
  ProgramRunId sparkRun2 = sparkProgram.run(RunIds.generate(2000));
  Set<ProgramRunId> directRuns = new HashSet<>(Arrays.asList(sparkRun1, sparkRun2));
  Set<Operation> directOperations = new HashSet<>(Arrays.asList(read, directWrite));

  Set<ProgramRunOperations> result = new HashSet<>();
  result.add(new ProgramRunOperations(directRuns, directOperations));

  // Second group: "offset" is normalized before being written.
  TransformOperation normalize = new TransformOperation("normalize", "normalizing offset",
                                                        Collections.singletonList(InputField.of("read", "offset")),
                                                        "offset");
  WriteOperation normalizedWrite = new WriteOperation("write", "writing file", anotherFileEndPoint,
                                                      InputField.of("normalize", "offset"),
                                                      InputField.of("parse", "name"),
                                                      InputField.of("parse", "address"),
                                                      InputField.of("parse", "zip"));
  ProgramRunId sparkRun3 = sparkProgram.run(RunIds.generate(3000));
  ProgramRunId sparkRun4 = sparkProgram.run(RunIds.generate(5000));
  ProgramRunId mrRun1 = mrProgram.run(RunIds.generate(4000));
  ProgramRunId mrRun2 = mrProgram.run(RunIds.generate(6000));
  Set<ProgramRunId> normalizedRuns = new HashSet<>(Arrays.asList(sparkRun3, sparkRun4, mrRun1, mrRun2));
  Set<Operation> normalizedOperations = new HashSet<>(Arrays.asList(read, normalize, normalizedWrite));
  result.add(new ProgramRunOperations(normalizedRuns, normalizedOperations));

  return result;
}
Aggregations