use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
the class UpgradeTool method createInjector.
@VisibleForTesting
Injector createInjector() {
return Guice.createInjector(new ConfigModule(cConf, hConf), RemoteAuthenticatorModules.getDefaultModule(), new DFSLocationModule(), new ZKClientModule(), new ZKDiscoveryModule(), new MessagingClientModule(), Modules.override(new DataSetsModules().getDistributedModules()).with(new AbstractModule() {
@Override
protected void configure() {
bind(DatasetFramework.class).to(InMemoryDatasetFramework.class).in(Scopes.SINGLETON);
// the DataSetsModules().getDistributedModules() binds to RemoteDatasetFramework so override that to
// the same InMemoryDatasetFramework
bind(DatasetFramework.class).annotatedWith(Names.named(DataSetsModules.BASE_DATASET_FRAMEWORK)).to(DatasetFramework.class);
bind(DatasetDefinitionRegistryFactory.class).to(DefaultDatasetDefinitionRegistryFactory.class).in(Scopes.SINGLETON);
// CDAP-5954 Upgrade tool does not need to record lineage and metadata changes for now.
bind(LineageWriter.class).to(NoOpLineageWriter.class);
bind(FieldLineageWriter.class).to(NoOpLineageWriter.class);
}
}), new TwillModule(), new ExploreClientModule(), new ProgramRunnerRuntimeModule().getDistributedModules(), new SystemDatasetRuntimeModule().getDistributedModules(), new KafkaClientModule(), new IOModule(), CoreSecurityRuntimeModule.getDistributedModule(cConf), new AuthenticationContextModules().getMasterModule(), new AuthorizationModule(), new AuthorizationEnforcementModule().getMasterModule(), new SecureStoreServerModule(), new DataFabricModules(UpgradeTool.class.getName()).getDistributedModules(), new AppFabricServiceRuntimeModule(cConf).getDistributedModules(), new KafkaLogAppenderModule(), // the DataFabricDistributedModule needs MetricsCollectionService binding
new AbstractModule() {
@Override
protected void configure() {
// Since Upgrade tool does not do anything with Metrics we just bind it to no-op implementations
bind(MetricsCollectionService.class).toInstance(new NoOpMetricsCollectionService());
bind(MetricsSystemClient.class).toInstance(new NoOpMetricsSystemClient());
}
@Provides
@Singleton
@Named("datasetInstanceManager")
@SuppressWarnings("unused")
public DatasetInstanceManager getDatasetInstanceManager(TransactionRunner transactionRunner) {
return new DatasetInstanceManager(transactionRunner);
}
// This is needed because the LocalApplicationManager
// expects a dsframework injection named datasetMDS
@Provides
@Singleton
@Named("datasetMDS")
@SuppressWarnings("unused")
public DatasetFramework getInDsFramework(DatasetFramework dsFramework) {
return dsFramework;
}
}, new AbstractModule() {
@Override
protected void configure() {
// TODO (CDAP-14677): find a better way to inject metadata publisher
bind(MetadataServiceClient.class).to(NoOpMetadataServiceClient.class);
}
});
}
use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
the class SparkRuntimeService method destroy.
/**
* Calls the destroy or onFinish method of {@link ProgramLifecycle}.
*/
private void destroy(final ProgramState state) {
context.setState(state);
TransactionControl defaultTxControl = runtimeContext.getDefaultTxControl();
TransactionControl txControl = spark instanceof ProgramLifecycle ? Transactions.getTransactionControl(defaultTxControl, Spark.class, spark, "destroy") : defaultTxControl;
runtimeContext.destroyProgram(programLifecycle, txControl, false);
if (emitFieldLineage()) {
try {
// here we cannot call context.flushRecord() since the WorkflowNodeState will need to record and store
// the lineage information
FieldLineageInfo info = new FieldLineageInfo(runtimeContext.getFieldLineageOperations());
fieldLineageWriter.write(runtimeContext.getProgramRunId(), info);
} catch (Throwable t) {
LOG.warn("Failed to emit the field lineage operations for Spark {}", runtimeContext.getProgramRunId(), t);
}
}
}
use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
the class LineageTestRun method testFlowLineage.
@Test
public void testFlowLineage() throws Exception {
NamespaceId namespace = new NamespaceId("testFlowLineage");
ApplicationId app = namespace.app(AllProgramsApp.NAME);
ProgramId flow = app.flow(AllProgramsApp.NoOpFlow.NAME);
DatasetId dataset = namespace.dataset(AllProgramsApp.DATASET_NAME);
StreamId stream = namespace.stream(AllProgramsApp.STREAM_NAME);
namespaceClient.create(new NamespaceMeta.Builder().setName(namespace).build());
try {
appClient.deploy(namespace, createAppJarFile(AllProgramsApp.class));
// Add metadata to applicaton
ImmutableMap<String, String> appProperties = ImmutableMap.of("app-key1", "app-value1");
addProperties(app, appProperties);
Assert.assertEquals(appProperties, getProperties(app, MetadataScope.USER));
ImmutableSet<String> appTags = ImmutableSet.of("app-tag1");
addTags(app, appTags);
Assert.assertEquals(appTags, getTags(app, MetadataScope.USER));
// Add metadata to flow
ImmutableMap<String, String> flowProperties = ImmutableMap.of("flow-key1", "flow-value1");
addProperties(flow, flowProperties);
Assert.assertEquals(flowProperties, getProperties(flow, MetadataScope.USER));
ImmutableSet<String> flowTags = ImmutableSet.of("flow-tag1", "flow-tag2");
addTags(flow, flowTags);
Assert.assertEquals(flowTags, getTags(flow, MetadataScope.USER));
// Add metadata to dataset
ImmutableMap<String, String> dataProperties = ImmutableMap.of("data-key1", "data-value1");
addProperties(dataset, dataProperties);
Assert.assertEquals(dataProperties, getProperties(dataset, MetadataScope.USER));
ImmutableSet<String> dataTags = ImmutableSet.of("data-tag1", "data-tag2");
addTags(dataset, dataTags);
Assert.assertEquals(dataTags, getTags(dataset, MetadataScope.USER));
// Add metadata to stream
ImmutableMap<String, String> streamProperties = ImmutableMap.of("stream-key1", "stream-value1");
addProperties(stream, streamProperties);
Assert.assertEquals(streamProperties, getProperties(stream, MetadataScope.USER));
ImmutableSet<String> streamTags = ImmutableSet.of("stream-tag1", "stream-tag2");
addTags(stream, streamTags);
Assert.assertEquals(streamTags, getTags(stream, MetadataScope.USER));
long startTime = TimeMathParser.nowInSeconds();
RunId flowRunId = runAndWait(flow);
// Wait for few seconds so that the stop time secs is more than start time secs.
TimeUnit.SECONDS.sleep(2);
waitForStop(flow, true);
long stopTime = TimeMathParser.nowInSeconds();
// Fetch dataset lineage
LineageRecord lineage = fetchLineage(dataset, startTime, stopTime, 10);
LineageRecord expected = LineageSerializer.toLineageRecord(startTime, stopTime, new Lineage(ImmutableSet.of(new Relation(dataset, flow, AccessType.UNKNOWN, flowRunId, ImmutableSet.of(flow.flowlet(AllProgramsApp.A.NAME))), new Relation(stream, flow, AccessType.READ, flowRunId, ImmutableSet.of(flow.flowlet(AllProgramsApp.A.NAME))))), Collections.<CollapseType>emptySet());
Assert.assertEquals(expected, lineage);
// Fetch dataset lineage with time strings
lineage = fetchLineage(dataset, "now-1h", "now+1h", 10);
Assert.assertEquals(expected.getRelations(), lineage.getRelations());
// Fetch stream lineage
lineage = fetchLineage(stream, startTime, stopTime, 10);
// same as dataset's lineage
Assert.assertEquals(expected, lineage);
// Fetch stream lineage with time strings
lineage = fetchLineage(stream, "now-1h", "now+1h", 10);
// same as dataset's lineage
Assert.assertEquals(expected.getRelations(), lineage.getRelations());
// Assert metadata
// Id.Flow needs conversion to Id.Program JIRA - CDAP-3658
Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, appProperties, appTags), new MetadataRecord(flow, MetadataScope.USER, flowProperties, flowTags), new MetadataRecord(dataset, MetadataScope.USER, dataProperties, dataTags), new MetadataRecord(stream, MetadataScope.USER, streamProperties, streamTags)), fetchRunMetadata(flow.run(flowRunId.getId())));
// Assert with a time range after the flow run should return no results
long laterStartTime = stopTime + 1000;
long laterEndTime = stopTime + 5000;
// Fetch stream lineage
lineage = fetchLineage(stream, laterStartTime, laterEndTime, 10);
Assert.assertEquals(LineageSerializer.toLineageRecord(laterStartTime, laterEndTime, new Lineage(ImmutableSet.<Relation>of()), Collections.<CollapseType>emptySet()), lineage);
// Assert with a time range before the flow run should return no results
long earlierStartTime = startTime - 5000;
long earlierEndTime = startTime - 1000;
// Fetch stream lineage
lineage = fetchLineage(stream, earlierStartTime, earlierEndTime, 10);
Assert.assertEquals(LineageSerializer.toLineageRecord(earlierStartTime, earlierEndTime, new Lineage(ImmutableSet.<Relation>of()), Collections.<CollapseType>emptySet()), lineage);
// Test bad time ranges
fetchLineage(dataset, "sometime", "sometime", 10, BadRequestException.class);
fetchLineage(dataset, "now+1h", "now-1h", 10, BadRequestException.class);
// Test non-existent run
assertRunMetadataNotFound(flow.run(RunIds.generate(1000).getId()));
} finally {
namespaceClient.delete(namespace);
}
}
use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
the class LineageTestRun method testAllProgramsLineage.
@Test
public void testAllProgramsLineage() throws Exception {
NamespaceId namespace = new NamespaceId("testAllProgramsLineage");
ApplicationId app = namespace.app(AllProgramsApp.NAME);
ProgramId flow = app.flow(AllProgramsApp.NoOpFlow.NAME);
ProgramId mapreduce = app.mr(AllProgramsApp.NoOpMR.NAME);
ProgramId mapreduce2 = app.mr(AllProgramsApp.NoOpMR2.NAME);
ProgramId spark = app.spark(AllProgramsApp.NoOpSpark.NAME);
ProgramId service = app.service(AllProgramsApp.NoOpService.NAME);
ProgramId worker = app.worker(AllProgramsApp.NoOpWorker.NAME);
ProgramId workflow = app.workflow(AllProgramsApp.NoOpWorkflow.NAME);
DatasetId dataset = namespace.dataset(AllProgramsApp.DATASET_NAME);
DatasetId dataset2 = namespace.dataset(AllProgramsApp.DATASET_NAME2);
DatasetId dataset3 = namespace.dataset(AllProgramsApp.DATASET_NAME3);
StreamId stream = namespace.stream(AllProgramsApp.STREAM_NAME);
namespaceClient.create(new NamespaceMeta.Builder().setName(namespace.getNamespace()).build());
try {
appClient.deploy(namespace, createAppJarFile(AllProgramsApp.class));
// Add metadata
ImmutableSet<String> sparkTags = ImmutableSet.of("spark-tag1", "spark-tag2");
addTags(spark, sparkTags);
Assert.assertEquals(sparkTags, getTags(spark, MetadataScope.USER));
ImmutableSet<String> workerTags = ImmutableSet.of("worker-tag1");
addTags(worker, workerTags);
Assert.assertEquals(workerTags, getTags(worker, MetadataScope.USER));
ImmutableMap<String, String> datasetProperties = ImmutableMap.of("data-key1", "data-value1");
addProperties(dataset, datasetProperties);
Assert.assertEquals(datasetProperties, getProperties(dataset, MetadataScope.USER));
// Start all programs
RunId flowRunId = runAndWait(flow);
RunId mrRunId = runAndWait(mapreduce);
RunId mrRunId2 = runAndWait(mapreduce2);
RunId sparkRunId = runAndWait(spark);
runAndWait(workflow);
RunId workflowMrRunId = getRunId(mapreduce, mrRunId);
RunId serviceRunId = runAndWait(service);
// Worker makes a call to service to make it access datasets,
// hence need to make sure service starts before worker, and stops after it.
RunId workerRunId = runAndWait(worker);
// Wait for programs to finish
waitForStop(flow, true);
waitForStop(mapreduce, false);
waitForStop(mapreduce2, false);
waitForStop(spark, false);
waitForStop(workflow, false);
waitForStop(worker, false);
waitForStop(service, true);
long now = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis());
long oneHour = TimeUnit.HOURS.toSeconds(1);
// Fetch dataset lineage
LineageRecord lineage = fetchLineage(dataset, now - oneHour, now + oneHour, toSet(CollapseType.ACCESS), 10);
// dataset is accessed by all programs
LineageRecord expected = LineageSerializer.toLineageRecord(now - oneHour, now + oneHour, new Lineage(ImmutableSet.of(// Dataset access
new Relation(dataset, flow, AccessType.UNKNOWN, flowRunId, toSet(flow.flowlet(AllProgramsApp.A.NAME))), new Relation(dataset, mapreduce, AccessType.WRITE, mrRunId), new Relation(dataset, mapreduce2, AccessType.WRITE, mrRunId2), new Relation(dataset2, mapreduce2, AccessType.READ, mrRunId2), new Relation(dataset, spark, AccessType.READ, sparkRunId), new Relation(dataset2, spark, AccessType.WRITE, sparkRunId), new Relation(dataset3, spark, AccessType.READ, sparkRunId), new Relation(dataset3, spark, AccessType.WRITE, sparkRunId), new Relation(dataset, mapreduce, AccessType.WRITE, workflowMrRunId), new Relation(dataset, service, AccessType.WRITE, serviceRunId), new Relation(dataset, worker, AccessType.WRITE, workerRunId), // Stream access
new Relation(stream, flow, AccessType.READ, flowRunId, ImmutableSet.of(flow.flowlet(AllProgramsApp.A.NAME))), new Relation(stream, mapreduce, AccessType.READ, mrRunId), new Relation(stream, spark, AccessType.READ, sparkRunId), new Relation(stream, mapreduce, AccessType.READ, workflowMrRunId), new Relation(stream, worker, AccessType.WRITE, workerRunId))), toSet(CollapseType.ACCESS));
Assert.assertEquals(expected, lineage);
// Fetch stream lineage
lineage = fetchLineage(stream, now - oneHour, now + oneHour, toSet(CollapseType.ACCESS), 10);
// stream too is accessed by all programs
Assert.assertEquals(expected, lineage);
// Assert metadata
// Id.Flow needs conversion to Id.Program JIRA - CDAP-3658
Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(flow, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(dataset, MetadataScope.USER, datasetProperties, emptySet()), new MetadataRecord(stream, MetadataScope.USER, emptyMap(), emptySet())), fetchRunMetadata(flow.run(flowRunId.getId())));
// Id.Worker needs conversion to Id.Program JIRA - CDAP-3658
ProgramId programForWorker = new ProgramId(worker.getNamespace(), worker.getApplication(), worker.getType(), worker.getEntityName());
Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(programForWorker, MetadataScope.USER, emptyMap(), workerTags), new MetadataRecord(dataset, MetadataScope.USER, datasetProperties, emptySet()), new MetadataRecord(stream, MetadataScope.USER, emptyMap(), emptySet())), fetchRunMetadata(worker.run(workerRunId.getId())));
// Id.Spark needs conversion to Id.Program JIRA - CDAP-3658
ProgramId programForSpark = new ProgramId(spark.getNamespace(), spark.getApplication(), spark.getType(), spark.getEntityName());
Assert.assertEquals(toSet(new MetadataRecord(app, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(programForSpark, MetadataScope.USER, emptyMap(), sparkTags), new MetadataRecord(dataset, MetadataScope.USER, datasetProperties, emptySet()), new MetadataRecord(dataset2, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(dataset3, MetadataScope.USER, emptyMap(), emptySet()), new MetadataRecord(stream, MetadataScope.USER, emptyMap(), emptySet())), fetchRunMetadata(spark.run(sparkRunId.getId())));
} finally {
namespaceClient.delete(namespace);
}
}
use of io.cdap.cdap.data2.metadata.lineage.Lineage in project cdap by caskdata.
the class LineageAdmin method doComputeLineage.
private Lineage doComputeLineage(final NamespacedEntityId sourceData, long startMillis, long endMillis, int levels, @Nullable String rollup) throws NotFoundException {
LOG.trace("Computing lineage for data {}, startMillis {}, endMillis {}, levels {}", sourceData, startMillis, endMillis, levels);
// Convert start time and end time period into scan keys in terms of program start times.
Set<RunId> runningInRange = store.getRunningInRange(TimeUnit.MILLISECONDS.toSeconds(startMillis), TimeUnit.MILLISECONDS.toSeconds(endMillis));
if (LOG.isTraceEnabled()) {
LOG.trace("Got {} rundIds in time range ({}, {})", runningInRange.size(), startMillis, endMillis);
}
ScanRangeWithFilter scanRange = getScanRange(runningInRange);
LOG.trace("Using scan start = {}, scan end = {}", scanRange.getStart(), scanRange.getEnd());
Multimap<RelationKey, Relation> relations = HashMultimap.create();
Set<NamespacedEntityId> visitedDatasets = new HashSet<>();
Set<NamespacedEntityId> toVisitDatasets = new HashSet<>();
Set<ProgramId> visitedPrograms = new HashSet<>();
Set<ProgramId> toVisitPrograms = new HashSet<>();
toVisitDatasets.add(sourceData);
for (int i = 0; i < levels; ++i) {
LOG.trace("Level {}", i);
toVisitPrograms.clear();
for (NamespacedEntityId d : toVisitDatasets) {
if (visitedDatasets.add(d)) {
LOG.trace("Visiting dataset {}", d);
// Fetch related programs
Iterable<Relation> programRelations = getProgramRelations(d, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter());
LOG.trace("Got program relations {}", programRelations);
for (Relation relation : programRelations) {
relations.put(new RelationKey(relation), relation);
}
Iterables.addAll(toVisitPrograms, Iterables.transform(programRelations, RELATION_TO_PROGRAM_FUNCTION));
}
}
toVisitDatasets.clear();
for (ProgramId p : toVisitPrograms) {
if (visitedPrograms.add(p)) {
LOG.trace("Visiting program {}", p);
// Fetch related datasets
Iterable<Relation> datasetRelations = lineageStoreReader.getRelations(p, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter());
LOG.trace("Got data relations {}", datasetRelations);
for (Relation relation : datasetRelations) {
relations.put(new RelationKey(relation), relation);
}
Iterables.addAll(toVisitDatasets, Iterables.transform(datasetRelations, RELATION_TO_DATA_FUNCTION));
}
}
}
if (rollup != null && rollup.contains("workflow")) {
relations = doComputeRollupLineage(relations);
}
Lineage lineage = new Lineage(Iterables.concat(Maps.transformValues(relations.asMap(), COLLAPSE_UNKNOWN_TYPE_FUNCTION).values()));
LOG.trace("Got lineage {}", lineage);
return lineage;
}
Aggregations