Use of co.cask.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
The class LineageTestRun, method testAllProgramsLineage.
@Test
public void testAllProgramsLineage() throws Exception {
  NamespaceId namespace = new NamespaceId("testAllProgramsLineage");
  ApplicationId app = namespace.app(AllProgramsApp.NAME);
  ProgramId flow = app.flow(AllProgramsApp.NoOpFlow.NAME);
  ProgramId mapreduce = app.mr(AllProgramsApp.NoOpMR.NAME);
  ProgramId mapreduce2 = app.mr(AllProgramsApp.NoOpMR2.NAME);
  ProgramId spark = app.spark(AllProgramsApp.NoOpSpark.NAME);
  ProgramId service = app.service(AllProgramsApp.NoOpService.NAME);
  ProgramId worker = app.worker(AllProgramsApp.NoOpWorker.NAME);
  ProgramId workflow = app.workflow(AllProgramsApp.NoOpWorkflow.NAME);
  DatasetId dataset = namespace.dataset(AllProgramsApp.DATASET_NAME);
  DatasetId dataset2 = namespace.dataset(AllProgramsApp.DATASET_NAME2);
  DatasetId dataset3 = namespace.dataset(AllProgramsApp.DATASET_NAME3);
  StreamId stream = namespace.stream(AllProgramsApp.STREAM_NAME);
  namespaceClient.create(new NamespaceMeta.Builder().setName(namespace.getNamespace()).build());
  try {
    appClient.deploy(namespace, createAppJarFile(AllProgramsApp.class));

    // Add metadata
    ImmutableSet<String> sparkTags = ImmutableSet.of("spark-tag1", "spark-tag2");
    addTags(spark, sparkTags);
    Assert.assertEquals(sparkTags, getTags(spark, MetadataScope.USER));

    ImmutableSet<String> workerTags = ImmutableSet.of("worker-tag1");
    addTags(worker, workerTags);
    Assert.assertEquals(workerTags, getTags(worker, MetadataScope.USER));

    ImmutableMap<String, String> datasetProperties = ImmutableMap.of("data-key1", "data-value1");
    addProperties(dataset, datasetProperties);
    Assert.assertEquals(datasetProperties, getProperties(dataset, MetadataScope.USER));

    // Start all programs
    RunId flowRunId = runAndWait(flow);
    RunId mrRunId = runAndWait(mapreduce);
    RunId mrRunId2 = runAndWait(mapreduce2);
    RunId sparkRunId = runAndWait(spark);
    runAndWait(workflow);
    RunId workflowMrRunId = getRunId(mapreduce, mrRunId);
    RunId serviceRunId = runAndWait(service);
    // The worker calls the service to make it access datasets, so the service
    // must start before the worker and stop after it.
    RunId workerRunId = runAndWait(worker);

    // Wait for programs to finish
    waitForStop(flow, true);
    waitForStop(mapreduce, false);
    waitForStop(mapreduce2, false);
    waitForStop(spark, false);
    waitForStop(workflow, false);
    waitForStop(worker, false);
    waitForStop(service, true);

    long now = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis());
    long oneHour = TimeUnit.HOURS.toSeconds(1);

    // Fetch dataset lineage
    LineageRecord lineage = fetchLineage(dataset, now - oneHour, now + oneHour, toSet(CollapseType.ACCESS), 10);

    // The dataset is accessed by all programs
    LineageRecord expected = LineageSerializer.toLineageRecord(
      now - oneHour, now + oneHour,
      new Lineage(ImmutableSet.of(
        // Dataset access
        new Relation(dataset, flow, AccessType.UNKNOWN, flowRunId, toSet(flow.flowlet(AllProgramsApp.A.NAME))),
        new Relation(dataset, mapreduce, AccessType.WRITE, mrRunId),
        new Relation(dataset, mapreduce2, AccessType.WRITE, mrRunId2),
        new Relation(dataset2, mapreduce2, AccessType.READ, mrRunId2),
        new Relation(dataset, spark, AccessType.READ, sparkRunId),
        new Relation(dataset2, spark, AccessType.WRITE, sparkRunId),
        new Relation(dataset3, spark, AccessType.READ, sparkRunId),
        new Relation(dataset3, spark, AccessType.WRITE, sparkRunId),
        new Relation(dataset, mapreduce, AccessType.WRITE, workflowMrRunId),
        new Relation(dataset, service, AccessType.WRITE, serviceRunId),
        new Relation(dataset, worker, AccessType.WRITE, workerRunId),
        // Stream access
        new Relation(stream, flow, AccessType.READ, flowRunId, ImmutableSet.of(flow.flowlet(AllProgramsApp.A.NAME))),
        new Relation(stream, mapreduce, AccessType.READ, mrRunId),
        new Relation(stream, spark, AccessType.READ, sparkRunId),
        new Relation(stream, mapreduce, AccessType.READ, workflowMrRunId),
        new Relation(stream, worker, AccessType.WRITE, workerRunId))),
      toSet(CollapseType.ACCESS));
    Assert.assertEquals(expected, lineage);

    // Fetch stream lineage
    lineage = fetchLineage(stream, now - oneHour, now + oneHour, toSet(CollapseType.ACCESS), 10);
    // The stream, too, is accessed by all programs
    Assert.assertEquals(expected, lineage);

    // Assert metadata
    // Id.Flow needs conversion to Id.Program, JIRA CDAP-3658
    Assert.assertEquals(
      toSet(new MetadataRecord(app, MetadataScope.USER, emptyMap(), emptySet()),
            new MetadataRecord(flow, MetadataScope.USER, emptyMap(), emptySet()),
            new MetadataRecord(dataset, MetadataScope.USER, datasetProperties, emptySet()),
            new MetadataRecord(stream, MetadataScope.USER, emptyMap(), emptySet())),
      fetchRunMetadata(flow.run(flowRunId.getId())));

    // Id.Worker needs conversion to Id.Program, JIRA CDAP-3658
    ProgramId programForWorker =
      new ProgramId(worker.getNamespace(), worker.getApplication(), worker.getType(), worker.getEntityName());
    Assert.assertEquals(
      toSet(new MetadataRecord(app, MetadataScope.USER, emptyMap(), emptySet()),
            new MetadataRecord(programForWorker, MetadataScope.USER, emptyMap(), workerTags),
            new MetadataRecord(dataset, MetadataScope.USER, datasetProperties, emptySet()),
            new MetadataRecord(stream, MetadataScope.USER, emptyMap(), emptySet())),
      fetchRunMetadata(worker.run(workerRunId.getId())));

    // Id.Spark needs conversion to Id.Program, JIRA CDAP-3658
    ProgramId programForSpark =
      new ProgramId(spark.getNamespace(), spark.getApplication(), spark.getType(), spark.getEntityName());
    Assert.assertEquals(
      toSet(new MetadataRecord(app, MetadataScope.USER, emptyMap(), emptySet()),
            new MetadataRecord(programForSpark, MetadataScope.USER, emptyMap(), sparkTags),
            new MetadataRecord(dataset, MetadataScope.USER, datasetProperties, emptySet()),
            new MetadataRecord(dataset2, MetadataScope.USER, emptyMap(), emptySet()),
            new MetadataRecord(dataset3, MetadataScope.USER, emptyMap(), emptySet()),
            new MetadataRecord(stream, MetadataScope.USER, emptyMap(), emptySet())),
      fetchRunMetadata(spark.run(sparkRunId.getId())));
  } finally {
    namespaceClient.delete(namespace);
  }
}
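For orientation, here is a minimal sketch (not CDAP test code) of the Lineage construction that the expected value above relies on: a Lineage is just a set of Relation entries tying a dataset to a program run with an access type. All entity names are hypothetical, and the import paths are assumed from the CDAP packages referenced on this page.

// A minimal sketch: builds a two-relation Lineage for a single hypothetical
// MapReduce run, mirroring the Relation(dataset, program, accessType, runId)
// constructor used in the test above.
import co.cask.cdap.common.app.RunIds;
import co.cask.cdap.data2.metadata.lineage.AccessType;
import co.cask.cdap.data2.metadata.lineage.Lineage;
import co.cask.cdap.data2.metadata.lineage.Relation;
import co.cask.cdap.proto.id.ApplicationId;
import co.cask.cdap.proto.id.DatasetId;
import co.cask.cdap.proto.id.NamespaceId;
import co.cask.cdap.proto.id.ProgramId;
import com.google.common.collect.ImmutableSet;
import org.apache.twill.api.RunId;

public class LineageSketch {
  public static void main(String[] args) {
    NamespaceId ns = new NamespaceId("demo");   // hypothetical namespace
    ApplicationId app = ns.app("DemoApp");      // hypothetical application
    ProgramId mr = app.mr("DemoMR");            // hypothetical MapReduce program
    DatasetId input = ns.dataset("input");      // hypothetical datasets
    DatasetId output = ns.dataset("output");
    RunId run = RunIds.generate();

    // One READ and one WRITE relation for the same run
    Lineage lineage = new Lineage(ImmutableSet.of(
      new Relation(input, mr, AccessType.READ, run),
      new Relation(output, mr, AccessType.WRITE, run)));
    System.out.println(lineage.getRelations());
  }
}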
Use of co.cask.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
The class SystemDatasetRuntimeModule, method bindDefaultModules.
/**
 * Adds bindings for the dataset modules that are available by default.
 */
private void bindDefaultModules(MapBinder<String, DatasetModule> mapBinder) {
  mapBinder.addBinding("core").toInstance(new CoreDatasetsModule());
  mapBinder.addBinding("fileSet").toInstance(new FileSetModule());
  mapBinder.addBinding("timePartitionedFileSet").toInstance(new TimePartitionedFileSetModule());
  mapBinder.addBinding("partitionedFileSet").toInstance(new PartitionedFileSetModule());
  mapBinder.addBinding("objectMappedTable").toInstance(new ObjectMappedTableModule());
  mapBinder.addBinding("cube").toInstance(new CubeModule());
  mapBinder.addBinding("usage").toInstance(new UsageDatasetModule());
  mapBinder.addBinding("metadata").toInstance(new MetadataDatasetModule());
  mapBinder.addBinding("lineage").toInstance(new LineageDatasetModule());
  mapBinder.addBinding("externalDataset").toInstance(new ExternalDatasetModule());
}
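The method above relies on Guice's MapBinder, which aggregates every addBinding() call into a single injectable Map<String, DatasetModule>. A self-contained sketch of that mechanism, using a hypothetical stand-in DatasetModule interface so it runs without CDAP on the classpath:

import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import com.google.inject.Injector;
import com.google.inject.Key;
import com.google.inject.TypeLiteral;
import com.google.inject.multibindings.MapBinder;
import java.util.Map;

// Stand-in for CDAP's DatasetModule, just to keep this sketch self-contained
interface DatasetModule { }

public class MapBinderSketch {
  public static void main(String[] args) {
    Injector injector = Guice.createInjector(new AbstractModule() {
      @Override
      protected void configure() {
        MapBinder<String, DatasetModule> mapBinder =
          MapBinder.newMapBinder(binder(), String.class, DatasetModule.class);
        mapBinder.addBinding("core").toInstance(new DatasetModule() { });
        mapBinder.addBinding("fileSet").toInstance(new DatasetModule() { });
      }
    });
    // Guice collects every addBinding() into one injectable map
    Map<String, DatasetModule> modules =
      injector.getInstance(Key.get(new TypeLiteral<Map<String, DatasetModule>>() { }));
    System.out.println(modules.keySet());   // prints [core, fileSet]
  }
}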
Use of co.cask.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
The class LineageWriterDatasetFramework, method getDataset.
@Nullable
@Override
public <T extends Dataset> T getDataset(final DatasetId datasetInstanceId, final Map<String, String> arguments,
                                        @Nullable final ClassLoader classLoader,
                                        final DatasetClassLoaderProvider classLoaderProvider,
                                        @Nullable final Iterable<? extends EntityId> owners,
                                        final AccessType accessType) throws DatasetManagementException, IOException {
  Principal principal = authenticationContext.getPrincipal();
  try {
    // For system datasets, skip authorization and lineage recording (user programs should not
    // be allowed to access system datasets, CDAP-6649).
    // For non-system datasets, always perform authorization and lineage recording.
    AuthorizationEnforcer enforcer;
    DefaultDatasetRuntimeContext.DatasetAccessRecorder accessRecorder;
    if (!DatasetsUtil.isUserDataset(datasetInstanceId)) {
      enforcer = SYSTEM_NAMESPACE_ENFORCER;
      accessRecorder = SYSTEM_NAMESPACE_ACCESS_RECORDER;
    } else {
      enforcer = authorizationEnforcer;
      accessRecorder = new BasicDatasetAccessRecorder(datasetInstanceId, accessType, owners);
    }
    return DefaultDatasetRuntimeContext.execute(enforcer, accessRecorder, principal, datasetInstanceId,
                                                getConstructorDefaultAnnotation(accessType), new Callable<T>() {
      @Override
      public T call() throws Exception {
        return LineageWriterDatasetFramework.super.getDataset(datasetInstanceId, arguments, classLoader,
                                                              classLoaderProvider, owners, accessType);
      }
    });
  } catch (IOException | DatasetManagementException | ServiceUnavailableException e) {
    throw e;
  } catch (Exception e) {
    throw new DatasetManagementException("Failed to create dataset instance: " + datasetInstanceId, e);
  }
}
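The catch blocks above follow a deliberate pattern: exception types the caller declares (IOException, DatasetManagementException, ServiceUnavailableException) propagate untouched, while anything else is wrapped with context about the failing dataset. A stripped-down sketch of that pattern, with IllegalStateException standing in for CDAP's DatasetManagementException:

import java.io.IOException;
import java.util.concurrent.Callable;

public final class PrivilegedCall {
  // Known checked exceptions propagate as-is; everything else is wrapped
  // with context about the dataset being accessed.
  static <T> T execute(String datasetName, Callable<T> work) throws IOException {
    try {
      return work.call();
    } catch (IOException e) {
      throw e;   // declared exception type, rethrow untouched
    } catch (Exception e) {
      throw new IllegalStateException("Failed to create dataset instance: " + datasetName, e);
    }
  }

  public static void main(String[] args) throws IOException {
    // "myDataset" and the returned handle are hypothetical
    String result = execute("myDataset", () -> "dataset-handle");
    System.out.println(result);
  }
}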
Use of co.cask.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
The class UpgradeTool, method createInjector.
@VisibleForTesting
Injector createInjector() throws Exception {
  return Guice.createInjector(
    new ConfigModule(cConf, hConf),
    new LocationRuntimeModule().getDistributedModules(),
    new ZKClientModule(),
    new DiscoveryRuntimeModule().getDistributedModules(),
    new MessagingClientModule(),
    Modules.override(new DataSetsModules().getDistributedModules()).with(new AbstractModule() {
      @Override
      protected void configure() {
        bind(DatasetFramework.class).to(InMemoryDatasetFramework.class).in(Scopes.SINGLETON);
        // DataSetsModules().getDistributedModules() binds the base framework to
        // RemoteDatasetFramework, so override that binding to the same InMemoryDatasetFramework
        bind(DatasetFramework.class)
          .annotatedWith(Names.named(DataSetsModules.BASE_DATASET_FRAMEWORK))
          .to(DatasetFramework.class);
        install(new FactoryModuleBuilder()
                  .implement(DatasetDefinitionRegistry.class, DefaultDatasetDefinitionRegistry.class)
                  .build(DatasetDefinitionRegistryFactory.class));
        // CDAP-5954: the upgrade tool does not need to record lineage and metadata changes for now
        bind(LineageWriter.class).to(NoOpLineageWriter.class);
      }
    }),
    new ViewAdminModules().getDistributedModules(),
    new StreamAdminModules().getDistributedModules(),
    new NotificationFeedClientModule(),
    new TwillModule(),
    new ExploreClientModule(),
    new ProgramRunnerRuntimeModule().getDistributedModules(),
    new ServiceStoreModules().getDistributedModules(),
    new SystemDatasetRuntimeModule().getDistributedModules(),
    // real notifications are not needed for an upgrade, so use the in-memory implementations
    new NotificationServiceRuntimeModule().getInMemoryModules(),
    new KafkaClientModule(),
    new NamespaceStoreModule().getDistributedModules(),
    new AuthenticationContextModules().getMasterModule(),
    new AuthorizationModule(),
    new AuthorizationEnforcementModule().getMasterModule(),
    new SecureStoreModules().getDistributedModules(),
    new DataFabricModules(UpgradeTool.class.getName()).getDistributedModules(),
    new AppFabricServiceRuntimeModule().getDistributedModules(),
    new AbstractModule() {
      @Override
      protected void configure() {
        // DataFabricDistributedModule needs a MetricsCollectionService binding; since the
        // upgrade tool does nothing with metrics, bind it to NoOpMetricsCollectionService
        bind(MetricsCollectionService.class).to(NoOpMetricsCollectionService.class).in(Scopes.SINGLETON);
        bind(MetricDatasetFactory.class).to(DefaultMetricDatasetFactory.class).in(Scopes.SINGLETON);
        bind(MetricStore.class).to(DefaultMetricStore.class);
      }

      @Provides
      @Singleton
      @Named("datasetInstanceManager")
      @SuppressWarnings("unused")
      public DatasetInstanceManager getDatasetInstanceManager(TransactionSystemClientService txClient,
                                                              TransactionExecutorFactory txExecutorFactory,
                                                              @Named("datasetMDS") DatasetFramework framework) {
        return new DatasetInstanceManager(txClient, txExecutorFactory, framework);
      }

      // Needed because the LocalApplicationManager expects a DatasetFramework
      // injection named "datasetMDS"
      @Provides
      @Singleton
      @Named("datasetMDS")
      @SuppressWarnings("unused")
      public DatasetFramework getInDsFramework(DatasetFramework dsFramework) {
        return dsFramework;
      }
    });
}
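The central trick in createInjector() is Modules.override, which keeps every binding from the base module except the ones redefined in the overriding module; that is how the tool swaps in InMemoryDatasetFramework and NoOpLineageWriter without touching the rest of DataSetsModules. A minimal, self-contained sketch of those semantics, with hypothetical Writer types:

import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import com.google.inject.Injector;
import com.google.inject.util.Modules;

public class OverrideSketch {
  interface Writer { }
  static class RealWriter implements Writer { }
  static class NoOpWriter implements Writer { }

  public static void main(String[] args) {
    AbstractModule production = new AbstractModule() {
      @Override
      protected void configure() {
        bind(Writer.class).to(RealWriter.class);
      }
    };
    AbstractModule override = new AbstractModule() {
      @Override
      protected void configure() {
        bind(Writer.class).to(NoOpWriter.class);   // replaces the production binding
      }
    };
    // All bindings from 'production' survive except those redefined in 'override'
    Injector injector = Guice.createInjector(Modules.override(production).with(override));
    System.out.println(injector.getInstance(Writer.class).getClass().getSimpleName());   // NoOpWriter
  }
}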
Use of co.cask.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
The class LineageAdmin, method getScanRange.
/**
 * Converts a set of runIds into a scan range based on the earliest and latest start times
 * of the runs. Also adds a scan filter to include only runIds in the given set.
 *
 * @param runIds input set of runIds
 * @return scan range with filter
 */
@VisibleForTesting
static ScanRangeWithFilter getScanRange(final Set<RunId> runIds) {
  if (runIds.isEmpty()) {
    return new ScanRangeWithFilter(0, 0, Predicates.<Relation>alwaysFalse());
  }
  // Pick the earliest and latest start times for the lineage range
  long earliest = Long.MAX_VALUE;
  long latest = 0;
  for (RunId runId : runIds) {
    long runStartTime = RunIds.getTime(runId, TimeUnit.MILLISECONDS);
    if (runStartTime < earliest) {
      earliest = runStartTime;
    }
    if (runStartTime > latest) {
      latest = runStartTime;
    }
  }
  // The scan end key is exclusive, so add 1 to include the last runId
  return new ScanRangeWithFilter(earliest, latest + 1, new Predicate<Relation>() {
    @Override
    public boolean apply(Relation input) {
      return runIds.contains(input.getRun());
    }
  });
}
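As a quick standalone illustration of the range computation (plain longs standing in for RunId start times): take the minimum and maximum, then add 1 to the exclusive end key so the latest run still falls inside the scan:

import java.util.Arrays;
import java.util.List;

public class ScanRangeSketch {
  public static void main(String[] args) {
    // Hypothetical run start times in milliseconds
    List<Long> startTimes = Arrays.asList(1500000000L, 1500003600L, 1500001800L);
    long earliest = Long.MAX_VALUE;
    long latest = 0;
    for (long t : startTimes) {
      earliest = Math.min(earliest, t);
      latest = Math.max(latest, t);
    }
    // End key is exclusive, so add 1 so the latest run is still included
    System.out.println("[" + earliest + ", " + (latest + 1) + ")");
  }
}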