Search in sources :

Example 1 with EndPointField

use of io.cdap.cdap.data2.metadata.lineage.field.EndPointField in project cdap by caskdata.

the class LineageHTTPHandler method datasetFieldLineageDetails.

/**
 * Serves the operation details recorded for one field of a dataset.
 *
 * <p>Read permission on the dataset is enforced before anything else. The time range and the direction
 * are then parsed from the query parameters, and the computed details are sent back as JSON with
 * status OK.
 *
 * @param request the incoming HTTP request
 * @param responder the responder the JSON payload is written to
 * @param namespaceId the namespace of the dataset, taken from the request path
 * @param datasetId the name of the dataset, taken from the request path
 * @param field the field name to compute field operation details
 * @param directionStr the direction to compute the field level lineage, can be INCOMING, OUTGOING or BOTH;
 *                     "both" is used when the query parameter is absent
 * @param startStr the start time string, it can be a specific timestamp in milliseconds or a relative time,
 *                 using now and times added to it.
 * @param endStr the end time string, it can be a specific timestamp in milliseconds or a relative time,
 *               using now and times added to it.
 */
@GET
@Path("/namespaces/{namespace-id}/datasets/{dataset-id}/lineage/fields/{field-name}/operations")
public void datasetFieldLineageDetails(
    HttpRequest request,
    HttpResponder responder,
    @PathParam("namespace-id") String namespaceId,
    @PathParam("dataset-id") String datasetId,
    @PathParam("field-name") String field,
    @QueryParam("direction") @DefaultValue("both") String directionStr,
    @QueryParam("start") String startStr,
    @QueryParam("end") String endStr) throws Exception {
    // authorization comes first so that no parsing or lookup happens for callers without access
    DatasetId targetDataset = new DatasetId(namespaceId, datasetId);
    accessEnforcer.enforce(targetDataset, authenticationContext.getPrincipal(), StandardPermission.GET);

    TimeRange timeRange = parseRange(startStr, endStr);
    Constants.FieldLineage.Direction lineageDirection = parseDirection(directionStr);

    EndPointField target = new EndPointField(EndPoint.of(namespaceId, datasetId), field);
    FieldLineageDetails operationDetails = fieldLineageAdmin.getOperationDetails(
        lineageDirection, target, timeRange.getStart(), timeRange.getEnd());
    responder.sendJson(HttpResponseStatus.OK, GSON.toJson(operationDetails));
}
Also used : EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) FieldLineageDetails(io.cdap.cdap.proto.metadata.lineage.FieldLineageDetails) DatasetId(io.cdap.cdap.proto.id.DatasetId) Path(javax.ws.rs.Path) GET(javax.ws.rs.GET)

Example 2 with EndPointField

use of io.cdap.cdap.data2.metadata.lineage.field.EndPointField in project cdap by caskdata.

the class LineageHTTPHandler method datasetFieldLineageSummary.

/**
 * Serves the field level lineage summary for one field of a dataset.
 *
 * <p>Read permission on the dataset is enforced before anything else. The time range and the direction
 * are then parsed from the query parameters, and the computed summary is sent back as JSON with
 * status OK.
 *
 * @param request the incoming HTTP request
 * @param responder the responder the JSON payload is written to
 * @param namespaceId the namespace of the dataset, taken from the request path
 * @param datasetId the name of the dataset, taken from the request path
 * @param field the field name to compute field level lineage
 * @param directionStr the direction to compute the field level lineage, can be INCOMING, OUTGOING or BOTH.
 *                     NOTE(review): no {@code @DefaultValue} is declared for this parameter, so an omitted
 *                     query parameter presumably reaches {@code parseDirection} as {@code null} - confirm
 *                     that this is handled there.
 * @param startStr the start time string, it can be a specific timestamp in milliseconds or a relative time,
 *                 using now and times added to it.
 * @param endStr the end time string, it can be a specific timestamp in milliseconds or a relative time,
 *               using now and times added to it.
 */
@GET
@Path("/namespaces/{namespace-id}/datasets/{dataset-id}/lineage/fields/{field-name}")
public void datasetFieldLineageSummary(
    HttpRequest request,
    HttpResponder responder,
    @PathParam("namespace-id") String namespaceId,
    @PathParam("dataset-id") String datasetId,
    @PathParam("field-name") String field,
    @QueryParam("direction") String directionStr,
    @QueryParam("start") String startStr,
    @QueryParam("end") String endStr) throws Exception {
    // authorization comes first so that no parsing or lookup happens for callers without access
    DatasetId targetDataset = new DatasetId(namespaceId, datasetId);
    accessEnforcer.enforce(targetDataset, authenticationContext.getPrincipal(), StandardPermission.GET);

    TimeRange timeRange = parseRange(startStr, endStr);
    Constants.FieldLineage.Direction lineageDirection = parseDirection(directionStr);

    EndPointField target = new EndPointField(EndPoint.of(namespaceId, datasetId), field);
    FieldLineageSummary lineageSummary = fieldLineageAdmin.getFieldLineage(
        lineageDirection, target, timeRange.getStart(), timeRange.getEnd());
    responder.sendJson(HttpResponseStatus.OK, GSON.toJson(lineageSummary));
}
Also used : FieldLineageSummary(io.cdap.cdap.proto.metadata.lineage.FieldLineageSummary) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) DatasetId(io.cdap.cdap.proto.id.DatasetId) Path(javax.ws.rs.Path) GET(javax.ws.rs.GET)

Example 3 with EndPointField

use of io.cdap.cdap.data2.metadata.lineage.field.EndPointField in project cdap by caskdata.

the class FieldLineageAdmin method getDatasetFieldLineage.

/**
 * Get the summary for the specified dataset over a given time range depending on the direction specified.
 * The summary will contain all the field level lineage relations about all the fields in a dataset.
 *
 * <p>For every field that has lineage in the time range, the incoming and/or outgoing summaries are read
 * and turned into per-dataset relations. The number of fields of every related dataset is looked up once
 * and reused for later fields. Fields without lineage are added to the returned field set as well.
 *
 * @param direction the direction in which summary need to be computed
 * @param endPoint the EndPoint which represents the dataset that field level lineage needs to get computed
 * @param start start time (inclusive) in milliseconds
 * @param end end time (exclusive) in milliseconds
 * @return the summary which contains all the field level lineage information about all the fields in a dataset
 * @throws IOException if fails to get the schema of the dataset
 */
public DatasetFieldLineageSummary getDatasetFieldLineage(Constants.FieldLineage.Direction direction, EndPoint endPoint, long start, long end) throws IOException {
    // the direction is the same for every field, so decide once which sides have to be computed
    boolean includeIncoming = direction == Constants.FieldLineage.Direction.INCOMING || direction == Constants.FieldLineage.Direction.BOTH;
    boolean includeOutgoing = direction == Constants.FieldLineage.Direction.OUTGOING || direction == Constants.FieldLineage.Direction.BOTH;

    Set<String> lineageFields = fieldLineageReader.getFields(endPoint, start, end);
    Map<DatasetId, Set<FieldRelation>> incomingRelations = new HashMap<>();
    Map<DatasetId, Set<FieldRelation>> outgoingRelations = new HashMap<>();
    Map<DatasetId, Integer> fieldCount = new HashMap<>();
    for (String field : lineageFields) {
        EndPointField endPointField = new EndPointField(endPoint, field);
        // compute the incoming field level lineage
        if (includeIncoming) {
            Map<DatasetId, Set<String>> incomingSummary = convertSummaryToDatasetMap(fieldLineageReader.getIncomingSummary(endPointField, start, end));
            // compute the field count for all incoming datasets
            addMissingFieldCounts(fieldCount, incomingSummary.keySet(), start, end);
            // here the field itself will be the destination
            computeAndAddRelations(incomingRelations, field, true, incomingSummary);
        }
        // compute the outgoing field level lineage
        if (includeOutgoing) {
            Map<DatasetId, Set<String>> outgoingSummary = convertSummaryToDatasetMap(fieldLineageReader.getOutgoingSummary(endPointField, start, end));
            // compute the field count for all outgoing datasets
            addMissingFieldCounts(fieldCount, outgoingSummary.keySet(), start, end);
            // here the field itself will be the source
            computeAndAddRelations(outgoingRelations, field, false, outgoingSummary);
        }
    }
    Set<String> noLineageFields = getFieldsWithNoFieldLineage(endPoint, lineageFields);
    Set<String> allFields = ImmutableSet.<String>builder().addAll(lineageFields).addAll(noLineageFields).build();
    return new DatasetFieldLineageSummary(direction, start, end, new DatasetId(endPoint.getNamespace(), endPoint.getName()), allFields, fieldCount, incomingRelations, outgoingRelations);
}

/**
 * Stores the number of fields of every given dataset that does not have a count in the map yet.
 * Datasets that already have a count are left untouched and are not looked up again.
 *
 * @param fieldCount the map of dataset to field count to add to
 * @param datasets the datasets to make sure a count exists for; may contain {@code null}, which is
 *                 recorded with a count of 0
 * @param start start time in milliseconds passed to the field lookup
 * @param end end time in milliseconds passed to the field lookup
 */
private void addMissingFieldCounts(Map<DatasetId, Integer> fieldCount, Set<DatasetId> datasets, long start, long end) {
    for (DatasetId datasetId : datasets) {
        fieldCount.computeIfAbsent(datasetId, missingDataset -> missingDataset == null ? 0 : fieldLineageReader.getFields(EndPoint.of(missingDataset.getNamespace(), missingDataset.getDataset()), start, end).size());
    }
}
Also used : HashSet(java.util.HashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) HashMap(java.util.HashMap) DatasetId(io.cdap.cdap.proto.id.DatasetId)

Example 4 with EndPointField

use of io.cdap.cdap.data2.metadata.lineage.field.EndPointField in project cdap by caskdata.

the class FieldLineageAdmin method convertSummaryToDatasetMap.

/**
 * Groups the field names of the given summary by the dataset their endpoint refers to.
 *
 * @param summary the endpoint fields to group
 * @return a map from dataset to the names of its fields found in the summary; fields whose endpoint has
 *         no namespace or no name are collected under the {@code null} key
 */
private Map<DatasetId, Set<String>> convertSummaryToDatasetMap(Set<EndPointField> summary) {
    Map<DatasetId, Set<String>> fieldsByDataset = new HashMap<>();
    for (EndPointField entry : summary) {
        EndPoint location = entry.getEndPoint();
        // stays null if the field is not related to any dataset, it can either be generated or dropped
        DatasetId owner = null;
        if (location.getNamespace() != null && location.getName() != null) {
            owner = new DatasetId(location.getNamespace(), location.getName());
        }
        fieldsByDataset.computeIfAbsent(owner, unused -> new HashSet<>()).add(entry.getField());
    }
    return fieldsByDataset;
}
Also used : HashSet(java.util.HashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) HashMap(java.util.HashMap) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) DatasetId(io.cdap.cdap.proto.id.DatasetId)

Example 5 with EndPointField

use of io.cdap.cdap.data2.metadata.lineage.field.EndPointField in project cdap by caskdata.

the class FieldLineageAdminTest method testDatasetFieldLineageSummary.

/**
 * Checks the dataset level field lineage summary computed in both directions against a fake reader
 * that serves three source datasets and two destination datasets.
 */
@Test
public void testDatasetFieldLineageSummary() throws Exception {
    // the dataset fields
    Set<String> datasetFields = ImmutableSet.of("field1", "field2", "field3");
    ImmutableMap.Builder<EndPoint, Set<String>> fieldsByEndPoint = ImmutableMap.builder();

    /*
      Incoming fields
      src1: src1f1 -> field1
            src1f2 -> field1
      src2: src2f1 -> field1

      src2: src2f2 -> field2
      src3: src3f1 -> field2

      src3: src3f2 -> field3
     */
    EndPoint src1 = EndPoint.of("ns1", "src1");
    EndPoint src2 = EndPoint.of("ns1", "src2");
    EndPoint src3 = EndPoint.of("ns1", "src3");
    Map<String, Set<EndPointField>> incomingByField = ImmutableMap.of(
        "field1", ImmutableSet.of(
            new EndPointField(src1, "src1f1"),
            new EndPointField(src1, "src1f2"),
            new EndPointField(src2, "src2f1")),
        "field2", ImmutableSet.of(
            new EndPointField(src2, "src2f2"),
            new EndPointField(src3, "src3f1")),
        "field3", ImmutableSet.of(
            new EndPointField(src3, "src3f2")));
    fieldsByEndPoint.put(src1, ImmutableSet.of("src1f1", "src1f2", "src1f3"));
    fieldsByEndPoint.put(src2, ImmutableSet.of("src2f1", "src2f2"));
    fieldsByEndPoint.put(src3, ImmutableSet.of("src3f1", "src3f2"));

    /*
      Outgoing fields
      dest1: field1 -> dest1f1
      dest2: field1 -> dest2f1

      dest1: field2 -> dest1f2
      dest2: field2 -> dest2f1

      dest2: field3 -> dest2f2
     */
    EndPoint dest1 = EndPoint.of("ns1", "dest1");
    EndPoint dest2 = EndPoint.of("ns1", "dest2");
    Map<String, Set<EndPointField>> outgoingByField = ImmutableMap.of(
        "field1", ImmutableSet.of(
            new EndPointField(dest1, "dest1f1"),
            new EndPointField(dest2, "dest2f1")),
        "field2", ImmutableSet.of(
            new EndPointField(dest1, "dest1f2"),
            new EndPointField(dest2, "dest2f1")),
        "field3", ImmutableSet.of(
            new EndPointField(dest2, "dest2f2")));
    fieldsByEndPoint.put(dest1, ImmutableSet.of("dest1f1", "dest1f2", "dest1f3", "dest1f4"));
    fieldsByEndPoint.put(dest2, ImmutableSet.of("dest2f1", "dest2f2"));

    FieldLineageAdmin admin = new FieldLineageAdmin(
        new FakeFieldLineageReader(
            datasetFields,
            Collections.emptySet(),
            incomingByField,
            outgoingByField,
            Collections.emptySet(),
            fieldsByEndPoint.build()),
        metadataAdmin);

    // input dataset name does not matter since we use a mocked reader
    DatasetFieldLineageSummary result = admin.getDatasetFieldLineage(
        Constants.FieldLineage.Direction.BOTH, EndPoint.of("ns1", "ds1"), 0L, Long.MAX_VALUE);

    // the request parameters are echoed back in the summary
    Assert.assertEquals(Constants.FieldLineage.Direction.BOTH, result.getDirection());
    Assert.assertEquals(0L, result.getStartTs());
    Assert.assertEquals(Long.MAX_VALUE, result.getEndTs());
    Assert.assertEquals(datasetFields, result.getFields());
    Assert.assertEquals(new DatasetId("ns1", "ds1"), result.getDatasetId());

    // one entry per source dataset: its total field count and the relations into ds1
    Set<DatasetFieldLineageSummary.FieldLineageRelations> expectedIncoming = ImmutableSet.of(
        new DatasetFieldLineageSummary.FieldLineageRelations(
            new DatasetId("ns1", "src1"),
            3,
            ImmutableSet.of(
                new FieldRelation("src1f1", "field1"),
                new FieldRelation("src1f2", "field1"))),
        new DatasetFieldLineageSummary.FieldLineageRelations(
            new DatasetId("ns1", "src2"),
            2,
            ImmutableSet.of(
                new FieldRelation("src2f1", "field1"),
                new FieldRelation("src2f2", "field2"))),
        new DatasetFieldLineageSummary.FieldLineageRelations(
            new DatasetId("ns1", "src3"),
            2,
            ImmutableSet.of(
                new FieldRelation("src3f1", "field2"),
                new FieldRelation("src3f2", "field3"))));
    Assert.assertEquals(expectedIncoming, result.getIncoming());

    // one entry per destination dataset: its total field count and the relations out of ds1
    Set<DatasetFieldLineageSummary.FieldLineageRelations> expectedOutgoing = ImmutableSet.of(
        new DatasetFieldLineageSummary.FieldLineageRelations(
            new DatasetId("ns1", "dest1"),
            4,
            ImmutableSet.of(
                new FieldRelation("field1", "dest1f1"),
                new FieldRelation("field2", "dest1f2"))),
        new DatasetFieldLineageSummary.FieldLineageRelations(
            new DatasetId("ns1", "dest2"),
            2,
            ImmutableSet.of(
                new FieldRelation("field1", "dest2f1"),
                new FieldRelation("field2", "dest2f1"),
                new FieldRelation("field3", "dest2f2"))));
    Assert.assertEquals(expectedOutgoing, result.getOutgoing());
}
Also used : HashSet(java.util.HashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) ImmutableMap(com.google.common.collect.ImmutableMap) DatasetId(io.cdap.cdap.proto.id.DatasetId) Test(org.junit.Test)

Aggregations

EndPointField (io.cdap.cdap.data2.metadata.lineage.field.EndPointField)10 HashSet (java.util.HashSet)8 DatasetId (io.cdap.cdap.proto.id.DatasetId)6 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)5 Test (org.junit.Test)5 ImmutableSet (com.google.common.collect.ImmutableSet)3 ProgramRunId (io.cdap.cdap.proto.id.ProgramRunId)3 ArrayList (java.util.ArrayList)3 Set (java.util.Set)3 Operation (io.cdap.cdap.api.lineage.field.Operation)2 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)2 TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)2 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)2 LineageStoreReader (io.cdap.cdap.data2.metadata.lineage.LineageStoreReader)2 FieldLineageInfo (io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo)2 FieldLineageReader (io.cdap.cdap.data2.metadata.lineage.field.FieldLineageReader)2 FieldLineageWriter (io.cdap.cdap.data2.metadata.writer.FieldLineageWriter)2 LineageWriter (io.cdap.cdap.data2.metadata.writer.LineageWriter)2 MessagingLineageWriter (io.cdap.cdap.data2.metadata.writer.MessagingLineageWriter)2 NamespacedEntityId (io.cdap.cdap.proto.id.NamespacedEntityId)2