Search in sources :

Example 26 with Lineage

use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.

the class LineageHttpHandlerTestRun method testAllProgramsLineage.

@Test
public void testAllProgramsLineage() throws Exception {
    NamespaceId namespace = new NamespaceId("testAllProgramsLineage");
    ApplicationId app = namespace.app(AllProgramsApp.NAME);
    ProgramId mapreduce = app.mr(AllProgramsApp.NoOpMR.NAME);
    ProgramId mapreduce2 = app.mr(AllProgramsApp.NoOpMR2.NAME);
    ProgramId spark = app.spark(AllProgramsApp.NoOpSpark.NAME);
    ProgramId service = app.service(AllProgramsApp.NoOpService.NAME);
    ProgramId worker = app.worker(AllProgramsApp.NoOpWorker.NAME);
    ProgramId workflow = app.workflow(AllProgramsApp.NoOpWorkflow.NAME);
    DatasetId dataset = namespace.dataset(AllProgramsApp.DATASET_NAME);
    DatasetId dataset2 = namespace.dataset(AllProgramsApp.DATASET_NAME2);
    DatasetId dataset3 = namespace.dataset(AllProgramsApp.DATASET_NAME3);
    namespaceClient.create(new NamespaceMeta.Builder().setName(namespace.getNamespace()).build());
    try {
        appClient.deploy(namespace, createAppJarFile(AllProgramsApp.class));
        // Add metadata
        ImmutableSet<String> sparkTags = ImmutableSet.of("spark-tag1", "spark-tag2");
        addTags(spark, sparkTags);
        Assert.assertEquals(sparkTags, getTags(spark, MetadataScope.USER));
        ImmutableSet<String> workerTags = ImmutableSet.of("worker-tag1");
        addTags(worker, workerTags);
        Assert.assertEquals(workerTags, getTags(worker, MetadataScope.USER));
        ImmutableMap<String, String> datasetProperties = ImmutableMap.of("data-key1", "data-value1");
        addProperties(dataset, datasetProperties);
        Assert.assertEquals(datasetProperties, getProperties(dataset, MetadataScope.USER));
        // Start all programs
        RunId mrRunId = runAndWait(mapreduce);
        RunId mrRunId2 = runAndWait(mapreduce2);
        RunId sparkRunId = runAndWait(spark);
        runAndWait(workflow);
        RunId workflowMrRunId = getRunId(mapreduce, mrRunId);
        RunId serviceRunId = runAndWait(service);
        // Worker makes a call to service to make it access datasets,
        // hence need to make sure service starts before worker, and stops after it.
        RunId workerRunId = runAndWait(worker);
        // Wait for programs to finish
        waitForStop(mapreduce, false);
        waitForStop(mapreduce2, false);
        waitForStop(spark, false);
        waitForStop(workflow, false);
        waitForStop(worker, false);
        waitForStop(service, true);
        long now = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis());
        long oneHour = TimeUnit.HOURS.toSeconds(1);
        // Fetch dataset lineage
        LineageRecord lineage = fetchLineage(dataset, now - oneHour, now + oneHour, toSet(CollapseType.ACCESS), 10);
        // dataset is accessed by all programs
        LineageRecord expected = LineageSerializer.toLineageRecord(now - oneHour, now + oneHour, new Lineage(ImmutableSet.of(// Dataset access
        new Relation(dataset, mapreduce, AccessType.WRITE, mrRunId), new Relation(dataset3, mapreduce, AccessType.READ, mrRunId), new Relation(dataset, mapreduce2, AccessType.WRITE, mrRunId2), new Relation(dataset2, mapreduce2, AccessType.READ, mrRunId2), new Relation(dataset, spark, AccessType.READ, sparkRunId), new Relation(dataset2, spark, AccessType.WRITE, sparkRunId), new Relation(dataset3, spark, AccessType.READ, sparkRunId), new Relation(dataset3, spark, AccessType.WRITE, sparkRunId), new Relation(dataset, mapreduce, AccessType.WRITE, workflowMrRunId), new Relation(dataset3, mapreduce, AccessType.READ, workflowMrRunId), new Relation(dataset, service, AccessType.WRITE, serviceRunId), new Relation(dataset, worker, AccessType.WRITE, workerRunId))), toSet(CollapseType.ACCESS));
        Assert.assertEquals(expected, lineage);
    } finally {
        namespaceClient.delete(namespace);
    }
}
Also used : Lineage(io.cdap.cdap.data2.metadata.lineage.Lineage) AllProgramsApp(io.cdap.cdap.client.app.AllProgramsApp) ProgramId(io.cdap.cdap.proto.id.ProgramId) DatasetId(io.cdap.cdap.proto.id.DatasetId) Relation(io.cdap.cdap.data2.metadata.lineage.Relation) LineageRecord(io.cdap.cdap.proto.metadata.lineage.LineageRecord) NamespaceMeta(io.cdap.cdap.proto.NamespaceMeta) NamespaceId(io.cdap.cdap.proto.id.NamespaceId) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) RunId(org.apache.twill.api.RunId) Test(org.junit.Test)

Example 27 with Lineage

use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.

the class LineageWriterDatasetFramework method getDataset.

@Nullable
@Override
public <T extends Dataset> T getDataset(final DatasetId datasetInstanceId, final Map<String, String> arguments, @Nullable final ClassLoader classLoader, final DatasetClassLoaderProvider classLoaderProvider, @Nullable final Iterable<? extends EntityId> owners, final AccessType accessType) throws DatasetManagementException, IOException {
    Principal principal = authenticationContext.getPrincipal();
    try {
        // For system, skip authorization and lineage (user program shouldn't allow to access system dataset CDAP-6649)
        // For non-system dataset, always perform authorization and lineage.
        AccessEnforcer enforcer;
        DefaultDatasetRuntimeContext.DatasetAccessRecorder accessRecorder;
        if (!DatasetsUtil.isUserDataset(datasetInstanceId)) {
            enforcer = SYSTEM_NAMESPACE_ENFORCER;
            accessRecorder = SYSTEM_NAMESPACE_ACCESS_RECORDER;
        } else {
            enforcer = accessEnforcer;
            accessRecorder = new BasicDatasetAccessRecorder(datasetInstanceId, accessType, owners);
        }
        return DefaultDatasetRuntimeContext.execute(enforcer, accessRecorder, principal, datasetInstanceId, getConstructorDefaultAnnotation(accessType), () -> LineageWriterDatasetFramework.super.getDataset(datasetInstanceId, arguments, classLoader, classLoaderProvider, owners, accessType));
    } catch (IOException | DatasetManagementException | ServiceUnavailableException e) {
        throw e;
    } catch (Exception e) {
        throw new DatasetManagementException("Failed to create dataset instance: " + datasetInstanceId, e);
    }
}
Also used : DatasetManagementException(io.cdap.cdap.api.dataset.DatasetManagementException) AccessEnforcer(io.cdap.cdap.security.spi.authorization.AccessEnforcer) IOException(java.io.IOException) ServiceUnavailableException(io.cdap.cdap.common.ServiceUnavailableException) DefaultDatasetRuntimeContext(io.cdap.cdap.data2.dataset2.DefaultDatasetRuntimeContext) Principal(io.cdap.cdap.proto.security.Principal) ServiceUnavailableException(io.cdap.cdap.common.ServiceUnavailableException) DatasetManagementException(io.cdap.cdap.api.dataset.DatasetManagementException) IOException(java.io.IOException) Nullable(javax.annotation.Nullable)

Example 28 with Lineage

use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.

the class DataStreamsTest method testLineageWithMacros.

@Test
public void testLineageWithMacros() throws Exception {
    Schema schema = Schema.recordOf("test", Schema.Field.of("key", Schema.of(Schema.Type.STRING)), Schema.Field.of("value", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = ImmutableList.of(StructuredRecord.builder(schema).set("key", "key1").set("value", "value1").build(), StructuredRecord.builder(schema).set("key", "key2").set("value", "value2").build());
    String srcName = "lineageSource";
    String sinkName1 = "lineageOutput1";
    String sinkName2 = "lineageOutput2";
    DataStreamsConfig etlConfig = DataStreamsConfig.builder().addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 0L, srcName))).addStage(new ETLStage("sink", MockSink.getPlugin("${output}"))).addStage(new ETLStage("identity", IdentityTransform.getPlugin())).addConnection("source", "identity").addConnection("identity", "sink").setCheckpointDir(checkpointDir).setBatchInterval("1s").build();
    ApplicationId appId = NamespaceId.DEFAULT.app("lineageApp");
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    ProgramId spark = appId.spark(DataStreamsSparkLauncher.NAME);
    RunId runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName1);
    FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();
    LineageAdmin lineageAdmin = getLineageAdmin();
    // wait for the lineage get populated
    Tasks.waitFor(true, () -> {
        Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
        DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
        return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
    }, 10, TimeUnit.SECONDS);
    Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
    Set<Relation> expectedLineage = ImmutableSet.of(new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId), new Relation(NamespaceId.DEFAULT.dataset(sinkName1), spark, AccessType.WRITE, runId));
    Assert.assertEquals(expectedLineage, lineage.getRelations());
    DatasetFieldLineageSummary summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
    Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
    Assert.assertTrue(summary.getIncoming().isEmpty());
    Set<DatasetFieldLineageSummary.FieldLineageRelations> outgoing = summary.getOutgoing();
    Assert.assertEquals(1, outgoing.size());
    Set<DatasetFieldLineageSummary.FieldLineageRelations> expectedRelations = Collections.singleton(new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(sinkName1), 2, ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
    Assert.assertEquals(expectedRelations, outgoing);
    // here sleep for 1 seconds to start the second run because the dataset lineage is storing based on unit second
    TimeUnit.SECONDS.sleep(1);
    long startTimeMillis = System.currentTimeMillis();
    runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName2);
    // wait for the lineage get populated
    Tasks.waitFor(true, () -> {
        Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), startTimeMillis, System.currentTimeMillis(), 1, "workflow");
        long end = System.currentTimeMillis();
        DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), startTimeMillis, end);
        return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
    }, 10, TimeUnit.SECONDS);
    lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), startTimeMillis, System.currentTimeMillis(), 1, "workflow");
    expectedLineage = ImmutableSet.of(new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId), new Relation(NamespaceId.DEFAULT.dataset(sinkName2), spark, AccessType.WRITE, runId));
    Assert.assertEquals(expectedLineage, lineage.getRelations());
    summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), startTimeMillis, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
    Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
    Assert.assertTrue(summary.getIncoming().isEmpty());
    outgoing = summary.getOutgoing();
    Assert.assertEquals(1, outgoing.size());
    expectedRelations = Collections.singleton(new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(sinkName2), 2, ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
    Assert.assertEquals(expectedRelations, outgoing);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) FieldRelation(io.cdap.cdap.metadata.FieldRelation) Schema(io.cdap.cdap.api.data.schema.Schema) LineageAdmin(io.cdap.cdap.metadata.LineageAdmin) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) Relation(io.cdap.cdap.data2.metadata.lineage.Relation) FieldRelation(io.cdap.cdap.metadata.FieldRelation) RunId(org.apache.twill.api.RunId) HashSet(java.util.HashSet) Lineage(io.cdap.cdap.data2.metadata.lineage.Lineage) ProgramId(io.cdap.cdap.proto.id.ProgramId) DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) DatasetFieldLineageSummary(io.cdap.cdap.metadata.DatasetFieldLineageSummary) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin) Test(org.junit.Test)

Example 29 with Lineage

use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.

the class LineageHTTPHandler method datasetLineage.

/**
 * Get the lineage information about a dataset. This method does not give any information on field level lineage.
 *
 * @param startStr the start time string, it can be a specific timestamp in milliseconds or a relative time,
 *                 using now and times added to it.
 * @param endStr the end time string, it can be a specific timestamp in milliseconds or a relative time,
 *               using now and times added to it.
 * @param levels the level to compute the lineage
 * @param collapse indicate how to collapse the lineage result
 * @param rollup indicates whether to aggregate programs, currently supports rolling up programs into workflows
 */
@GET
@Path("/namespaces/{namespace-id}/datasets/{dataset-id}/lineage")
public void datasetLineage(HttpRequest request, HttpResponder responder, @PathParam("namespace-id") String namespaceId, @PathParam("dataset-id") String datasetId, @QueryParam("start") String startStr, @QueryParam("end") String endStr, @QueryParam("levels") @DefaultValue("10") int levels, @QueryParam("collapse") List<String> collapse, @QueryParam("rollup") String rollup) throws Exception {
    checkLevels(levels);
    TimeRange range = parseRange(startStr, endStr);
    DatasetId datasetInstance = new DatasetId(namespaceId, datasetId);
    accessEnforcer.enforce(datasetInstance, authenticationContext.getPrincipal(), StandardPermission.GET);
    Lineage lineage = lineageAdmin.computeLineage(datasetInstance, range.getStart(), range.getEnd(), levels, rollup);
    responder.sendJson(HttpResponseStatus.OK, GSON.toJson(LineageSerializer.toLineageRecord(TimeUnit.MILLISECONDS.toSeconds(range.getStart()), TimeUnit.MILLISECONDS.toSeconds(range.getEnd()), lineage, getCollapseTypes(collapse)), LineageRecord.class));
}
Also used : LineageRecord(io.cdap.cdap.proto.metadata.lineage.LineageRecord) Lineage(io.cdap.cdap.data2.metadata.lineage.Lineage) DatasetId(io.cdap.cdap.proto.id.DatasetId) Path(javax.ws.rs.Path) GET(javax.ws.rs.GET)

Example 30 with Lineage

use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.

the class LineageHTTPHandler method datasetFields.

/**
 * Get the fields in the dataset. This method can be used to get fields that have lineage or all the fields in the
 * dataset.
 *
 * @param startStr the start time string, it can be a specific timestamp in milliseconds or a relative time,
 *                 using now and times added to it.
 * @param endStr the end time string, it can be a specific timestamp in milliseconds or a relative time,
 *               using now and times added to it.
 * @param prefix the prefix of the field, if not provided, all fields will get return
 * @param includeCurrent a boolean to indicate whether to include fields that do not have field level lineage
 */
@GET
@Path("/namespaces/{namespace-id}/datasets/{dataset-id}/lineage/fields")
public void datasetFields(HttpRequest request, HttpResponder responder, @PathParam("namespace-id") String namespaceId, @PathParam("dataset-id") String datasetId, @QueryParam("start") String startStr, @QueryParam("end") String endStr, @QueryParam("prefix") String prefix, @QueryParam("includeCurrent") boolean includeCurrent) throws Exception {
    accessEnforcer.enforce(new DatasetId(namespaceId, datasetId), authenticationContext.getPrincipal(), StandardPermission.GET);
    TimeRange range = parseRange(startStr, endStr);
    Set<Field> result = fieldLineageAdmin.getFields(EndPoint.of(namespaceId, datasetId), range.getStart(), range.getEnd(), prefix, includeCurrent);
    // Field object.
    if (includeCurrent) {
        responder.sendJson(HttpResponseStatus.OK, GSON.toJson(result));
    } else {
        responder.sendJson(HttpResponseStatus.OK, GSON.toJson(result.stream().map(Field::getName).collect(Collectors.toSet())));
    }
}
Also used : EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) Field(io.cdap.cdap.proto.metadata.lineage.Field) DatasetId(io.cdap.cdap.proto.id.DatasetId) Path(javax.ws.rs.Path) GET(javax.ws.rs.GET)

Aggregations

Test (org.junit.Test)22 Lineage (io.cdap.cdap.data2.metadata.lineage.Lineage)13 Lineage (co.cask.cdap.data2.metadata.lineage.Lineage)12 Relation (io.cdap.cdap.data2.metadata.lineage.Relation)12 DatasetId (io.cdap.cdap.proto.id.DatasetId)12 LineageStoreReader (io.cdap.cdap.data2.metadata.lineage.LineageStoreReader)11 LineageWriter (io.cdap.cdap.data2.metadata.writer.LineageWriter)11 Relation (co.cask.cdap.data2.metadata.lineage.Relation)10 Store (io.cdap.cdap.app.store.Store)9 DefaultLineageStoreReader (io.cdap.cdap.data2.metadata.lineage.DefaultLineageStoreReader)9 TransactionRunner (io.cdap.cdap.spi.data.transaction.TransactionRunner)9 EndPointField (io.cdap.cdap.data2.metadata.lineage.field.EndPointField)8 BasicLineageWriter (io.cdap.cdap.data2.metadata.writer.BasicLineageWriter)8 ProgramRunId (io.cdap.cdap.proto.id.ProgramRunId)8 Store (co.cask.cdap.app.store.Store)7 LineageStore (co.cask.cdap.data2.metadata.lineage.LineageStore)7 MetadataStore (co.cask.cdap.data2.metadata.store.MetadataStore)7 ProgramId (io.cdap.cdap.proto.id.ProgramId)7 HashSet (java.util.HashSet)7 ApplicationId (io.cdap.cdap.proto.id.ApplicationId)6