Use of io.cdap.cdap.metadata.FieldLineageAdmin in project cdap by caskdata.
The class DataPipelineTest, method testActionFieldLineage.
private void testActionFieldLineage(Engine engine) throws Exception {
  String readDataset = "ActionReadDataset" + engine;
  String writeDataset = "ActionWriteDataset" + engine;
  List<String> srcFields = ImmutableList.of("srcField1", "srcField2", "srcField3");
  Set<String> destFields = ImmutableSet.of("destField1", "destField2", "destField3");

  List<Operation> operations = new ArrayList<>();
  /*
   *                     |-----> srcField1 --> destField1 -----|
   *                     |                                     |
   * ActionReadDataset --+-----> srcField2 --> destField2 -----+--> ActionWriteDataset
   *                     |                                     |
   *                     |-----> srcField3 --> destField3 -----|
   */
  operations.add(new ReadOperation("Read", "1st operation",
                                   EndPoint.of("default", readDataset), srcFields));
  operations.add(new TransformOperation(
    "Transform1", "2nd operation",
    Collections.singletonList(InputField.of("Read", "srcField1")), "destField1"));
  operations.add(new TransformOperation(
    "Transform2", "3rd operation",
    Collections.singletonList(InputField.of("Read", "srcField2")), "destField2"));
  operations.add(new TransformOperation(
    "Transform3", "4th operation",
    Collections.singletonList(InputField.of("Read", "srcField3")), "destField3"));
  operations.add(new WriteOperation(
    "Write", "5th operation", EndPoint.of("default", writeDataset),
    ImmutableList.of(InputField.of("Transform1", "destField1"),
                     InputField.of("Transform2", "destField2"),
                     InputField.of("Transform3", "destField3"))));

  ETLStage action = new ETLStage("action",
                                 FieldLineageAction.getPlugin(readDataset, writeDataset, operations));
  ETLBatchConfig etlConfig = ETLBatchConfig.builder()
    .addStage(action)
    .setEngine(engine)
    .build();
  AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationId appId = NamespaceId.DEFAULT.app("ActionFieldLineage-" + engine);
  ApplicationManager appManager = deployApplication(appId, appRequest);

  WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
  workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);

  FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();

  // get field lineage for the destination dataset
  DatasetFieldLineageSummary summary = fieldAdmin.getDatasetFieldLineage(
    Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", writeDataset),
    0, System.currentTimeMillis());
  Assert.assertEquals(NamespaceId.DEFAULT.dataset(writeDataset), summary.getDatasetId());
  Assert.assertEquals(destFields, summary.getFields());
  Assert.assertTrue(summary.getOutgoing().isEmpty());
  Assert.assertEquals(1, summary.getIncoming().size());
  Set<FieldRelation> fieldRelations = ImmutableSet.of(
    new FieldRelation("srcField1", "destField1"),
    new FieldRelation("srcField2", "destField2"),
    new FieldRelation("srcField3", "destField3"));
  DatasetFieldLineageSummary.FieldLineageRelations expectedRelations =
    new DatasetFieldLineageSummary.FieldLineageRelations(
      NamespaceId.DEFAULT.dataset(readDataset), 3, fieldRelations);
  Assert.assertEquals(expectedRelations, summary.getIncoming().iterator().next());

  // get field lineage for the source dataset
  summary = fieldAdmin.getDatasetFieldLineage(
    Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", readDataset),
    0, System.currentTimeMillis());
  Assert.assertEquals(NamespaceId.DEFAULT.dataset(readDataset), summary.getDatasetId());
  Assert.assertEquals(new HashSet<>(srcFields), summary.getFields());
  Assert.assertTrue(summary.getIncoming().isEmpty());
  Assert.assertEquals(1, summary.getOutgoing().size());
  expectedRelations = new DatasetFieldLineageSummary.FieldLineageRelations(
    NamespaceId.DEFAULT.dataset(writeDataset), 3, fieldRelations);
  Assert.assertEquals(expectedRelations, summary.getOutgoing().iterator().next());

  LineageAdmin lineageAdmin = getLineageAdmin();
  ProgramId programId = appId.workflow(SmartWorkflow.NAME);
  RunId runId = RunIds.fromString(workflowManager.getHistory().iterator().next().getPid());

  // get dataset lineage for the source dataset
  Tasks.waitFor(2, () -> {
    Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(readDataset),
                                                  0, System.currentTimeMillis(), 1, "workflow");
    return lineage.getRelations().size();
  }, 10, TimeUnit.SECONDS);
  Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(readDataset),
                                                0, System.currentTimeMillis(), 1, "workflow");
  Set<Relation> expectedLineage = ImmutableSet.of(
    new Relation(NamespaceId.DEFAULT.dataset(readDataset), programId, AccessType.READ, runId),
    new Relation(NamespaceId.DEFAULT.dataset(writeDataset), programId, AccessType.WRITE, runId));
  Assert.assertEquals(expectedLineage, lineage.getRelations());

  // get dataset lineage for the destination dataset; in this test it should be the same
  lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(writeDataset),
                                        0, System.currentTimeMillis(), 1, "workflow");
  Assert.assertEquals(2, lineage.getRelations().size());
  Assert.assertEquals(expectedLineage, lineage.getRelations());
}
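The test above always queries with Direction.BOTH. A minimal sketch of a narrower query, reusing fieldAdmin and writeDataset from the test; the INCOMING and OUTGOING enum values are an assumption inferred from the BOTH value used here, not confirmed by this snippet:

  // Assumed variant: compute only the incoming relations for the destination dataset.
  // Direction.INCOMING is an assumption; only Direction.BOTH appears in this listing.
  DatasetFieldLineageSummary incomingOnly = fieldAdmin.getDatasetFieldLineage(
    Constants.FieldLineage.Direction.INCOMING, EndPoint.of("default", writeDataset),
    0, System.currentTimeMillis());
  // With this direction, only getIncoming() would be expected to be populated.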
Use of io.cdap.cdap.metadata.FieldLineageAdmin in project cdap by caskdata.
The class TestBase, method initialize.
@BeforeClass
public static void initialize() throws Exception {
  if (nestedStartCount++ > 0) {
    return;
  }
  File localDataDir = TMP_FOLDER.newFolder();
  cConf = createCConf(localDataDir);
  CConfiguration previewCConf = createPreviewConf(cConf);
  LevelDBTableService previewLevelDBTableService = new LevelDBTableService();
  previewLevelDBTableService.setConfiguration(previewCConf);

  // enable default services
  File capabilityFolder = new File(localDataDir.toString(), "capability");
  capabilityFolder.mkdir();
  cConf.set(Constants.Capability.CONFIG_DIR, capabilityFolder.getAbsolutePath());
  cConf.setInt(Constants.Capability.AUTO_INSTALL_THREADS, 5);

  org.apache.hadoop.conf.Configuration hConf = new org.apache.hadoop.conf.Configuration();
  hConf.addResource("mapred-site-local.xml");
  hConf.reloadConfiguration();
  hConf.set(Constants.CFG_LOCAL_DATA_DIR, localDataDir.getAbsolutePath());
  hConf.set(Constants.AppFabric.OUTPUT_DIR, cConf.get(Constants.AppFabric.OUTPUT_DIR));
  hConf.set("hadoop.tmp.dir",
            new File(localDataDir, cConf.get(Constants.AppFabric.TEMP_DIR)).getAbsolutePath());

  // Windows specific requirements
  if (OSDetector.isWindows()) {
    File tmpDir = TMP_FOLDER.newFolder();
    File binDir = new File(tmpDir, "bin");
    Assert.assertTrue(binDir.mkdirs());
    copyTempFile("hadoop.dll", tmpDir);
    copyTempFile("winutils.exe", binDir);
    System.setProperty("hadoop.home.dir", tmpDir.getAbsolutePath());
    System.load(new File(tmpDir, "hadoop.dll").getAbsolutePath());
  }

  injector = Guice.createInjector(
    createDataFabricModule(),
    new TransactionExecutorModule(),
    new DataSetsModules().getStandaloneModules(),
    new DataSetServiceModules().getInMemoryModules(),
    new ConfigModule(cConf, hConf),
    RemoteAuthenticatorModules.getNoOpModule(),
    new IOModule(),
    new LocalLocationModule(),
    new InMemoryDiscoveryModule(),
    new AppFabricServiceRuntimeModule(cConf).getInMemoryModules(),
    new MonitorHandlerModule(false),
    new AuthenticationContextModules().getMasterModule(),
    new AuthorizationModule(),
    new AuthorizationEnforcementModule().getInMemoryModules(),
    new ProgramRunnerRuntimeModule().getInMemoryModules(),
    new SecureStoreServerModule(),
    new MetadataReaderWriterModules().getInMemoryModules(),
    new MetadataServiceModule(),
    new AbstractModule() {
      @Override
      protected void configure() {
        bind(MetricsManager.class).toProvider(MetricsManagerProvider.class);
      }
    },
    new MetricsClientRuntimeModule().getInMemoryModules(),
    new LocalLogAppenderModule(),
    new LogReaderRuntimeModules().getInMemoryModules(),
    new ExploreRuntimeModule().getInMemoryModules(),
    new ExploreClientModule(),
    new MessagingServerRuntimeModule().getInMemoryModules(),
    new PreviewConfigModule(cConf, new Configuration(), SConfiguration.create()),
    new PreviewManagerModule(false),
    new PreviewRunnerManagerModule().getInMemoryModules(),
    new SupportBundleServiceModule(),
    new MockProvisionerModule(),
    new AbstractModule() {
      @Override
      protected void configure() {
        install(new FactoryModuleBuilder()
                  .implement(ApplicationManager.class, DefaultApplicationManager.class)
                  .build(ApplicationManagerFactory.class));
        install(new FactoryModuleBuilder()
                  .implement(ArtifactManager.class, DefaultArtifactManager.class)
                  .build(ArtifactManagerFactory.class));
        bind(TemporaryFolder.class).toInstance(TMP_FOLDER);
        bind(AuthorizationHandler.class).in(Scopes.SINGLETON);
        // Needed by MonitorHandlerModule
        bind(TwillRunner.class).to(NoopTwillRunnerService.class);
        bind(MetadataSubscriberService.class).in(Scopes.SINGLETON);
      }
    });

  messagingService = injector.getInstance(MessagingService.class);
  if (messagingService instanceof Service) {
    ((Service) messagingService).startAndWait();
  }
  txService = injector.getInstance(TransactionManager.class);
  txService.startAndWait();

  metadataSubscriberService = injector.getInstance(MetadataSubscriberService.class);
  metadataStorage = injector.getInstance(MetadataStorage.class);
  metadataAdmin = injector.getInstance(MetadataAdmin.class);
  metadataStorage.createIndex();
  metadataService = injector.getInstance(MetadataService.class);
  metadataService.startAndWait();

  // Define all StructuredTables before starting any service that needs a StructuredTable
  StoreDefinition.createAllTables(injector.getInstance(StructuredTableAdmin.class));

  dsOpService = injector.getInstance(DatasetOpExecutorService.class);
  dsOpService.startAndWait();
  datasetService = injector.getInstance(DatasetService.class);
  datasetService.startAndWait();
  metricsCollectionService = injector.getInstance(MetricsCollectionService.class);
  metricsCollectionService.startAndWait();

  if (cConf.getBoolean(Constants.Explore.EXPLORE_ENABLED)) {
    exploreExecutorService = injector.getInstance(ExploreExecutorService.class);
    exploreExecutorService.startAndWait();
    // wait for the explore service to be discoverable
    DiscoveryServiceClient discoveryService = injector.getInstance(DiscoveryServiceClient.class);
    EndpointStrategy endpointStrategy = new RandomEndpointStrategy(
      () -> discoveryService.discover(Constants.Service.EXPLORE_HTTP_USER_SERVICE));
    Preconditions.checkNotNull(endpointStrategy.pick(5, TimeUnit.SECONDS),
                               "%s service is not up after 5 seconds",
                               Constants.Service.EXPLORE_HTTP_USER_SERVICE);
    exploreClient = injector.getInstance(ExploreClient.class);
  }

  programScheduler = injector.getInstance(Scheduler.class);
  if (programScheduler instanceof Service) {
    ((Service) programScheduler).startAndWait();
  }
  testManager = injector.getInstance(UnitTestManager.class);
  metricsManager = injector.getInstance(MetricsManager.class);
  accessControllerInstantiator = injector.getInstance(AccessControllerInstantiator.class);

  // This is needed so the logged-in user can successfully create the default namespace
  if (cConf.getBoolean(Constants.Security.Authorization.ENABLED)) {
    String user = System.getProperty("user.name");
    SecurityRequestContext.setUserId(user);
    InstanceId instance = new InstanceId(cConf.get(Constants.INSTANCE_NAME));
    Principal principal = new Principal(user, Principal.PrincipalType.USER);
    accessControllerInstantiator.get().grant(Authorizable.fromEntityId(instance), principal,
                                             EnumSet.allOf(StandardPermission.class));
    accessControllerInstantiator.get().grant(Authorizable.fromEntityId(NamespaceId.DEFAULT),
                                             principal, EnumSet.allOf(StandardPermission.class));
  }

  namespaceAdmin = injector.getInstance(NamespaceAdmin.class);
  if (firstInit) {
    // Only create the default namespace on the first test. If multiple tests run in the same JVM,
    // the default namespace already exists after the first one, because the
    // namespaceAdmin.delete(Id.Namespace.DEFAULT) in finish() only clears the default namespace
    // but does not remove it entirely.
    namespaceAdmin.create(NamespaceMeta.DEFAULT);
    ProfileService profileService = injector.getInstance(ProfileService.class);
    profileService.saveProfile(ProfileId.NATIVE, Profile.NATIVE);
  }
  secureStore = injector.getInstance(SecureStore.class);
  secureStoreManager = injector.getInstance(SecureStoreManager.class);
  messagingContext = new MultiThreadMessagingContext(messagingService);
  firstInit = false;

  previewHttpServer = injector.getInstance(PreviewHttpServer.class);
  previewHttpServer.startAndWait();

  fieldLineageAdmin = injector.getInstance(FieldLineageAdmin.class);
  lineageAdmin = injector.getInstance(LineageAdmin.class);
  metadataSubscriberService.startAndWait();

  previewRunnerManager = injector.getInstance(PreviewRunnerManager.class);
  if (previewRunnerManager instanceof Service) {
    ((Service) previewRunnerManager).startAndWait();
  }
  appFabricServer = injector.getInstance(AppFabricServer.class);
  appFabricServer.startAndWait();
  preferencesService = injector.getInstance(PreferencesService.class);

  scheduler = injector.getInstance(Scheduler.class);
  if (scheduler instanceof Service) {
    ((Service) scheduler).startAndWait();
  }
  if (scheduler instanceof CoreSchedulerService) {
    ((CoreSchedulerService) scheduler).waitUntilFunctional(10, TimeUnit.SECONDS);
  }

  supportBundleInternalService = injector.getInstance(SupportBundleInternalService.class);
  supportBundleInternalService.startAndWait();
  appFabricHealthCheckService = injector.getInstance(HealthCheckService.class);
  appFabricHealthCheckService.helper(Constants.AppFabricHealthCheck.APP_FABRIC_HEALTH_CHECK_SERVICE,
                                     cConf, Constants.Service.MASTER_SERVICES_BIND_ADDRESS);
  appFabricHealthCheckService.startAndWait();
}
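The tests in this listing call getFieldLineageAdmin() and getLineageAdmin() rather than touching the static fields directly. A minimal sketch of what those TestBase accessors presumably look like, given that initialize() populates fieldLineageAdmin and lineageAdmin from the injector; the exact modifiers and visibility are assumptions:

  // Sketch of the accessors assumed by the tests above; TestBase's real
  // declarations may differ in visibility or naming.
  protected static FieldLineageAdmin getFieldLineageAdmin() {
    return fieldLineageAdmin;
  }

  protected static LineageAdmin getLineageAdmin() {
    return lineageAdmin;
  }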
Use of io.cdap.cdap.metadata.FieldLineageAdmin in project cdap by caskdata.
The class DataStreamsTest, method testLineageWithMacros.
@Test
public void testLineageWithMacros() throws Exception {
  Schema schema = Schema.recordOf("test",
                                  Schema.Field.of("key", Schema.of(Schema.Type.STRING)),
                                  Schema.Field.of("value", Schema.of(Schema.Type.STRING)));
  List<StructuredRecord> input = ImmutableList.of(
    StructuredRecord.builder(schema).set("key", "key1").set("value", "value1").build(),
    StructuredRecord.builder(schema).set("key", "key2").set("value", "value2").build());

  String srcName = "lineageSource";
  String sinkName1 = "lineageOutput1";
  String sinkName2 = "lineageOutput2";

  DataStreamsConfig etlConfig = DataStreamsConfig.builder()
    .addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 0L, srcName)))
    .addStage(new ETLStage("sink", MockSink.getPlugin("${output}")))
    .addStage(new ETLStage("identity", IdentityTransform.getPlugin()))
    .addConnection("source", "identity")
    .addConnection("identity", "sink")
    .setCheckpointDir(checkpointDir)
    .setBatchInterval("1s")
    .build();
  ApplicationId appId = NamespaceId.DEFAULT.app("lineageApp");
  AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
  ApplicationManager appManager = deployApplication(appId, appRequest);
  ProgramId spark = appId.spark(DataStreamsSparkLauncher.NAME);

  RunId runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName1);
  FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();
  LineageAdmin lineageAdmin = getLineageAdmin();

  // wait for the lineage to get populated
  Tasks.waitFor(true, () -> {
    Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName),
                                                    0, System.currentTimeMillis(), 1, "workflow");
    DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(
      Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName),
      0, System.currentTimeMillis());
    return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
  }, 10, TimeUnit.SECONDS);

  Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName),
                                                0, System.currentTimeMillis(), 1, "workflow");
  Set<Relation> expectedLineage = ImmutableSet.of(
    new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId),
    new Relation(NamespaceId.DEFAULT.dataset(sinkName1), spark, AccessType.WRITE, runId));
  Assert.assertEquals(expectedLineage, lineage.getRelations());

  DatasetFieldLineageSummary summary = fieldAdmin.getDatasetFieldLineage(
    Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName),
    0, System.currentTimeMillis());
  Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
  Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
  Assert.assertTrue(summary.getIncoming().isEmpty());
  Set<DatasetFieldLineageSummary.FieldLineageRelations> outgoing = summary.getOutgoing();
  Assert.assertEquals(1, outgoing.size());
  Set<DatasetFieldLineageSummary.FieldLineageRelations> expectedRelations =
    Collections.singleton(new DatasetFieldLineageSummary.FieldLineageRelations(
      NamespaceId.DEFAULT.dataset(sinkName1), 2,
      ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
  Assert.assertEquals(expectedRelations, outgoing);

  // sleep for 1 second before starting the second run, because dataset lineage is stored
  // with second-level granularity
  TimeUnit.SECONDS.sleep(1);
  long startTimeMillis = System.currentTimeMillis();
  runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName2);

  // wait for the lineage to get populated
  Tasks.waitFor(true, () -> {
    Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName),
                                                    startTimeMillis, System.currentTimeMillis(),
                                                    1, "workflow");
    long end = System.currentTimeMillis();
    DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(
      Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName),
      startTimeMillis, end);
    return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
  }, 10, TimeUnit.SECONDS);

  lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName),
                                        startTimeMillis, System.currentTimeMillis(), 1, "workflow");
  expectedLineage = ImmutableSet.of(
    new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId),
    new Relation(NamespaceId.DEFAULT.dataset(sinkName2), spark, AccessType.WRITE, runId));
  Assert.assertEquals(expectedLineage, lineage.getRelations());

  summary = fieldAdmin.getDatasetFieldLineage(
    Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName),
    startTimeMillis, System.currentTimeMillis());
  Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
  Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
  Assert.assertTrue(summary.getIncoming().isEmpty());
  outgoing = summary.getOutgoing();
  Assert.assertEquals(1, outgoing.size());
  expectedRelations = Collections.singleton(new DatasetFieldLineageSummary.FieldLineageRelations(
    NamespaceId.DEFAULT.dataset(sinkName2), 2,
    ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
  Assert.assertEquals(expectedRelations, outgoing);
}
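The helper testLineageWithMacro(...) is not shown in this listing. A hedged sketch of what it plausibly does, based only on how it is called above (it takes the expected records and the value for the ${output} macro, and returns a RunId); the program-manager calls, dataset reads, and wait durations here are assumptions, not the actual DataStreamsTest code:

  // Hypothetical reconstruction: bind the ${output} macro to the given sink, run the
  // streaming pipeline, wait until the sink holds the expected records, then return the run id.
  private RunId testLineageWithMacro(ApplicationManager appManager,
                                     Set<StructuredRecord> expected,
                                     String outputName) throws Exception {
    SparkManager sparkManager = appManager.getSparkManager(DataStreamsSparkLauncher.NAME);
    sparkManager.start(Collections.singletonMap("output", outputName));
    sparkManager.waitForRun(ProgramRunStatus.RUNNING, 10, TimeUnit.SECONDS);

    // poll the macro-resolved sink dataset until it contains the expected records
    DataSetManager<Table> outputManager = getDataset(outputName);
    Tasks.waitFor(true, () -> {
      outputManager.flush();
      return expected.equals(new HashSet<>(MockSink.readOutput(outputManager)));
    }, 1, TimeUnit.MINUTES);

    sparkManager.stop();
    sparkManager.waitForStopped(10, TimeUnit.SECONDS);
    return RunIds.fromString(sparkManager.getHistory().iterator().next().getPid());
  }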