Search in sources :

Example 6 with DatasetId

use of io.cdap.cdap.proto.id.DatasetId in project cdap by caskdata.

the class LineageAdmin method filterAndAddRelations.

/**
 * Copies the given relations into the {@code relations} multimap, optionally rolling programs up to their workflow.
 *
 * When {@code rollUpWorkflow} is set and the relation's program run has an entry in {@code programWorkflowMap},
 * the relation is re-created against the mapped workflow run. Relations whose dataset name ends with the workflow
 * run id are treated as local datasets: they are not added to {@code relations} and are collected instead.
 * All other relations are added unchanged.
 *
 * @param rollUpWorkflow whether inner programs should be replaced by their workflow
 * @param relations the multimap that receives the kept relations, keyed by {@link RelationKey}
 * @param programWorkflowMap mapping from a program run to the workflow run it should be rolled up to
 * @param relationss the relations to filter
 * @return the set of local datasets whose relations were skipped
 */
private Set<DatasetId> filterAndAddRelations(boolean rollUpWorkflow, Multimap<RelationKey, Relation> relations, Map<ProgramRunId, ProgramRunId> programWorkflowMap, Set<Relation> relationss) {
    Set<DatasetId> skippedLocalDatasets = new HashSet<>();
    for (Relation candidate : relationss) {
        boolean rollUp = rollUpWorkflow && programWorkflowMap.containsKey(candidate.getProgramRunId());
        if (!rollUp) {
            // nothing to rewrite, keep the relation as it is
            relations.put(new RelationKey(candidate), candidate);
            continue;
        }
        ProgramRunId workflowRun = programWorkflowMap.get(candidate.getProgramRunId());
        DatasetId dataset = (DatasetId) candidate.getData();
        // local datasets always end with the workflow run id; remember them and drop the relation
        if (dataset.getDataset().endsWith(workflowRun.getRun())) {
            skippedLocalDatasets.add(dataset);
            continue;
        }
        // re-attribute the access to the workflow and its run
        Relation rolledUp = new Relation(dataset, workflowRun.getParent(), candidate.getAccess(), RunIds.fromString(workflowRun.getRun()));
        relations.put(new RelationKey(rolledUp), rolledUp);
    }
    return skippedLocalDatasets;
}
Also used : Relation(io.cdap.cdap.data2.metadata.lineage.Relation) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) DatasetId(io.cdap.cdap.proto.id.DatasetId) HashSet(java.util.HashSet)

Example 7 with DatasetId

use of io.cdap.cdap.proto.id.DatasetId in project cdap by caskdata.

the class LineageHTTPHandler method datasetFieldLineageSummary.

/**
 * Responds with the field level lineage summary of one field of a dataset, serialized as JSON.
 * The caller must hold {@code StandardPermission.GET} on the dataset; the check happens before any parsing.
 *
 * @param request the incoming HTTP request
 * @param responder the responder that receives the JSON summary with status OK
 * @param namespaceId the namespace of the dataset, from the {@code namespace-id} path segment
 * @param datasetId the name of the dataset, from the {@code dataset-id} path segment
 * @param field the field name to compute field level lineage
 * @param directionStr the direction to compute the field level lineage, can be INCOMING, OUTGOING or BOTH
 * @param startStr the start time string, it can be a specific timestamp in milliseconds or a relative time,
 *                 using now and times added to it.
 * @param endStr the end time string, it can be a specific timestamp in milliseconds or a relative time,
 *               using now and times added to it.
 */
@GET
@Path("/namespaces/{namespace-id}/datasets/{dataset-id}/lineage/fields/{field-name}")
public void datasetFieldLineageSummary(HttpRequest request, HttpResponder responder, @PathParam("namespace-id") String namespaceId, @PathParam("dataset-id") String datasetId, @PathParam("field-name") String field, @QueryParam("direction") String directionStr, @QueryParam("start") String startStr, @QueryParam("end") String endStr) throws Exception {
    // authorize against the dataset before touching any of the query parameters
    DatasetId dataset = new DatasetId(namespaceId, datasetId);
    accessEnforcer.enforce(dataset, authenticationContext.getPrincipal(), StandardPermission.GET);

    TimeRange range = parseRange(startStr, endStr);
    Constants.FieldLineage.Direction direction = parseDirection(directionStr);

    EndPoint endPoint = EndPoint.of(namespaceId, datasetId);
    EndPointField endPointField = new EndPointField(endPoint, field);
    FieldLineageSummary summary = fieldLineageAdmin.getFieldLineage(direction, endPointField, range.getStart(), range.getEnd());

    String json = GSON.toJson(summary);
    responder.sendJson(HttpResponseStatus.OK, json);
}
Also used : FieldLineageSummary(io.cdap.cdap.proto.metadata.lineage.FieldLineageSummary) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) DatasetId(io.cdap.cdap.proto.id.DatasetId) Path(javax.ws.rs.Path) GET(javax.ws.rs.GET)

Example 8 with DatasetId

use of io.cdap.cdap.proto.id.DatasetId in project cdap by caskdata.

the class LineageHTTPHandler method datasetFieldLineageDetails.

/**
 * Responds with the operation details of one field of a dataset, serialized as JSON.
 * The caller must hold {@code StandardPermission.GET} on the dataset; the check happens before any parsing.
 *
 * @param request the incoming HTTP request
 * @param responder the responder that receives the JSON details with status OK
 * @param namespaceId the namespace of the dataset, from the {@code namespace-id} path segment
 * @param datasetId the name of the dataset, from the {@code dataset-id} path segment
 * @param field the field name to compute field operation details
 * @param directionStr the direction to compute the field level lineage, can be INCOMING, OUTGOING or BOTH;
 *                     defaults to {@code both} when the query parameter is absent
 * @param startStr the start time string, it can be a specific timestamp in milliseconds or a relative time,
 *                 using now and times added to it.
 * @param endStr the end time string, it can be a specific timestamp in milliseconds or a relative time,
 *               using now and times added to it.
 */
@GET
@Path("/namespaces/{namespace-id}/datasets/{dataset-id}/lineage/fields/{field-name}/operations")
public void datasetFieldLineageDetails(HttpRequest request, HttpResponder responder, @PathParam("namespace-id") String namespaceId, @PathParam("dataset-id") String datasetId, @PathParam("field-name") String field, @QueryParam("direction") @DefaultValue("both") String directionStr, @QueryParam("start") String startStr, @QueryParam("end") String endStr) throws Exception {
    // authorize against the dataset before touching any of the query parameters
    DatasetId dataset = new DatasetId(namespaceId, datasetId);
    accessEnforcer.enforce(dataset, authenticationContext.getPrincipal(), StandardPermission.GET);

    TimeRange range = parseRange(startStr, endStr);
    Constants.FieldLineage.Direction direction = parseDirection(directionStr);

    EndPoint endPoint = EndPoint.of(namespaceId, datasetId);
    EndPointField endPointField = new EndPointField(endPoint, field);
    FieldLineageDetails details = fieldLineageAdmin.getOperationDetails(direction, endPointField, range.getStart(), range.getEnd());

    String json = GSON.toJson(details);
    responder.sendJson(HttpResponseStatus.OK, json);
}
Also used : EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) FieldLineageDetails(io.cdap.cdap.proto.metadata.lineage.FieldLineageDetails) DatasetId(io.cdap.cdap.proto.id.DatasetId) Path(javax.ws.rs.Path) GET(javax.ws.rs.GET)

Example 9 with DatasetId

use of io.cdap.cdap.proto.id.DatasetId in project cdap by caskdata.

the class FieldLineageAdmin method convertSummaryToDatasetMap.

/**
 * Groups the field names of the given summary by the dataset their endpoint refers to.
 * Fields whose endpoint lacks a namespace or a name are grouped under the {@code null} key.
 *
 * @param summary the endpoint fields to group
 * @return a mutable map from dataset (possibly the {@code null} key) to the field names seen for it
 */
private Map<DatasetId, Set<String>> convertSummaryToDatasetMap(Set<EndPointField> summary) {
    Map<DatasetId, Set<String>> fieldsByDataset = new HashMap<>();
    for (EndPointField entry : summary) {
        EndPoint source = entry.getEndPoint();
        // stays null if the field is not related to any dataset, it can either be generated or dropped
        DatasetId dataset = null;
        if (source.getNamespace() != null && source.getName() != null) {
            dataset = new DatasetId(source.getNamespace(), source.getName());
        }
        fieldsByDataset.computeIfAbsent(dataset, unused -> new HashSet<>()).add(entry.getField());
    }
    return fieldsByDataset;
}
Also used : HashSet(java.util.HashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) HashMap(java.util.HashMap) EndPoint(io.cdap.cdap.api.lineage.field.EndPoint) DatasetId(io.cdap.cdap.proto.id.DatasetId)

Example 10 with DatasetId

use of io.cdap.cdap.proto.id.DatasetId in project cdap by caskdata.

the class FieldLineageAdmin method getDatasetFieldLineage.

/**
 * Get the summary for the specified dataset over a given time range depending on the direction specified.
 * The summary will contain all the field level lineage relations about all the fields in a dataset.
 *
 * @param direction the direction in which summary need to be computed
 * @param endPoint the EndPoint which represents the dataset that field level lineage needs to get computed
 * @param start start time (inclusive) in milliseconds
 * @param end end time (exclusive) in milliseconds
 * @return the summary which contains all the field level lineage information about all the fields in a dataset
 * @throws IOException if fails to get the schema of the dataset
 */
public DatasetFieldLineageSummary getDatasetFieldLineage(Constants.FieldLineage.Direction direction, EndPoint endPoint, long start, long end) throws IOException {
    Set<String> lineageFields = fieldLineageReader.getFields(endPoint, start, end);
    Map<DatasetId, Set<FieldRelation>> incomingRelations = new HashMap<>();
    Map<DatasetId, Set<FieldRelation>> outgoingRelations = new HashMap<>();
    Map<DatasetId, Integer> fieldCount = new HashMap<>();
    // the requested direction does not change per field, so decide once which sides to compute
    boolean includeIncoming = direction == Constants.FieldLineage.Direction.INCOMING || direction == Constants.FieldLineage.Direction.BOTH;
    boolean includeOutgoing = direction == Constants.FieldLineage.Direction.OUTGOING || direction == Constants.FieldLineage.Direction.BOTH;
    for (String field : lineageFields) {
        EndPointField endPointField = new EndPointField(endPoint, field);
        // compute the incoming field level lineage
        if (includeIncoming) {
            Map<DatasetId, Set<String>> incomingSummary = convertSummaryToDatasetMap(fieldLineageReader.getIncomingSummary(endPointField, start, end));
            // compute the field count for all incoming datasets
            recordFieldCounts(fieldCount, incomingSummary.keySet(), start, end);
            // here the field itself will be the destination
            computeAndAddRelations(incomingRelations, field, true, incomingSummary);
        }
        // compute the outgoing field level lineage
        if (includeOutgoing) {
            Map<DatasetId, Set<String>> outgoingSummary = convertSummaryToDatasetMap(fieldLineageReader.getOutgoingSummary(endPointField, start, end));
            // compute the field count for all outgoing datasets
            recordFieldCounts(fieldCount, outgoingSummary.keySet(), start, end);
            // here the field itself will be the source
            computeAndAddRelations(outgoingRelations, field, false, outgoingSummary);
        }
    }
    Set<String> noLineageFields = getFieldsWithNoFieldLineage(endPoint, lineageFields);
    Set<String> allFields = ImmutableSet.<String>builder().addAll(lineageFields).addAll(noLineageFields).build();
    return new DatasetFieldLineageSummary(direction, start, end, new DatasetId(endPoint.getNamespace(), endPoint.getName()), allFields, fieldCount, incomingRelations, outgoingRelations);
}

/**
 * Records the number of lineage fields for every dataset that does not have a count yet.
 * The {@code null} dataset key is recorded with a count of 0; datasets already present in
 * {@code fieldCount} are left untouched and are not looked up again.
 *
 * @param fieldCount the map of counts to add to
 * @param datasets the datasets to record counts for, may contain {@code null}
 * @param start start time in milliseconds, passed through to the field lookup
 * @param end end time in milliseconds, passed through to the field lookup
 */
private void recordFieldCounts(Map<DatasetId, Integer> fieldCount, Set<DatasetId> datasets, long start, long end) {
    for (DatasetId dataset : datasets) {
        fieldCount.computeIfAbsent(dataset, missingDataset -> missingDataset == null ? 0 : fieldLineageReader.getFields(EndPoint.of(missingDataset.getNamespace(), missingDataset.getDataset()), start, end).size());
    }
}
Also used : HashSet(java.util.HashSet) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) HashMap(java.util.HashMap) DatasetId(io.cdap.cdap.proto.id.DatasetId)

Aggregations

DatasetId (io.cdap.cdap.proto.id.DatasetId) 370, Test (org.junit.Test) 212, NamespaceId (io.cdap.cdap.proto.id.NamespaceId) 70, Path (javax.ws.rs.Path) 54, ProgramId (io.cdap.cdap.proto.id.ProgramId) 42, TransactionExecutor (org.apache.tephra.TransactionExecutor) 42, ApplicationId (io.cdap.cdap.proto.id.ApplicationId) 38, QueryResult (io.cdap.cdap.proto.QueryResult) 36, HashMap (java.util.HashMap) 34, HashSet (java.util.HashSet) 34, ColumnDesc (io.cdap.cdap.proto.ColumnDesc) 32, NamespaceMeta (io.cdap.cdap.proto.NamespaceMeta) 32, Map (java.util.Map) 32, DatasetSpecification (io.cdap.cdap.api.dataset.DatasetSpecification) 30, PartitionedFileSet (io.cdap.cdap.api.dataset.lib.PartitionedFileSet) 30, Set (java.util.Set) 30, Table (io.cdap.cdap.api.dataset.table.Table) 28, POST (javax.ws.rs.POST) 28, TransactionFailureException (org.apache.tephra.TransactionFailureException) 28, Location (org.apache.twill.filesystem.Location) 28