Search in sources :

Example 1 with FieldLineageAdmin

use of io.cdap.cdap.metadata.FieldLineageAdmin in project cdap by caskdata.

the class DataPipelineTest method testActionFieldLineage.

private void testActionFieldLineage(Engine engine) throws Exception {
    String readDataset = "ActionReadDataset" + engine;
    String writeDataset = "ActionWriteDataset" + engine;
    List<String> srcFields = ImmutableList.of("srcField1", "srcField2", "srcField3");
    Set<String> destFields = ImmutableSet.of("destField1", "destField2", "destField3");
    List<Operation> operations = new ArrayList<>();
    /*
     *          |---------> srcField1 -> destField1----|
     *          |                                      |
     * ActionReadDataset -> srcField2 -> destField2 ---|-> ActionWriteDataset
     *          |                                      |
     *          |---------> srcField3 -> destField3 ---|
     */
    operations.add(new ReadOperation("Read", "1st operation", EndPoint.of("default", readDataset), srcFields));
    operations.add(new TransformOperation("Transform1", "2nd operation", Collections.singletonList(InputField.of("Read", "srcField1")), "destField1"));
    operations.add(new TransformOperation("Transform2", "3rd operation", Collections.singletonList(InputField.of("Read", "srcField2")), "destField2"));
    operations.add(new TransformOperation("Transform3", "4th operation", Collections.singletonList(InputField.of("Read", "srcField3")), "destField3"));
    operations.add(new WriteOperation("Write", "5th operation", EndPoint.of("default", writeDataset), ImmutableList.of(InputField.of("Transform1", "destField1"), InputField.of("Transform2", "destField2"), InputField.of("Transform3", "destField3"))));
    ETLStage action = new ETLStage("action", FieldLineageAction.getPlugin(readDataset, writeDataset, operations));
    ETLBatchConfig etlConfig = ETLBatchConfig.builder().addStage(action).setEngine(engine).build();
    AppRequest<ETLBatchConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationId appId = NamespaceId.DEFAULT.app("ActionFieldLineage-" + engine);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    WorkflowManager workflowManager = appManager.getWorkflowManager(SmartWorkflow.NAME);
    workflowManager.startAndWaitForGoodRun(ProgramRunStatus.COMPLETED, 5, TimeUnit.MINUTES);
    FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();
    // get field lineage for dest dataset
    DatasetFieldLineageSummary summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", writeDataset), 0, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(writeDataset), summary.getDatasetId());
    Assert.assertEquals(destFields, summary.getFields());
    Assert.assertTrue(summary.getOutgoing().isEmpty());
    Assert.assertEquals(1, summary.getIncoming().size());
    Set<FieldRelation> fieldRelations = ImmutableSet.of(new FieldRelation("srcField1", "destField1"), new FieldRelation("srcField2", "destField2"), new FieldRelation("srcField3", "destField3"));
    DatasetFieldLineageSummary.FieldLineageRelations expectedRelations = new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(readDataset), 3, fieldRelations);
    Assert.assertEquals(expectedRelations, summary.getIncoming().iterator().next());
    // get field lineage for src dataset
    summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", readDataset), 0, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(readDataset), summary.getDatasetId());
    Assert.assertEquals(new HashSet<>(srcFields), summary.getFields());
    Assert.assertTrue(summary.getIncoming().isEmpty());
    Assert.assertEquals(1, summary.getOutgoing().size());
    expectedRelations = new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(writeDataset), 3, fieldRelations);
    Assert.assertEquals(expectedRelations, summary.getOutgoing().iterator().next());
    LineageAdmin lineageAdmin = getLineageAdmin();
    ProgramId programId = appId.workflow(SmartWorkflow.NAME);
    RunId runId = RunIds.fromString(workflowManager.getHistory().iterator().next().getPid());
    // get dataset lineage for src dataset
    Tasks.waitFor(2, () -> {
        Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(readDataset), 0, System.currentTimeMillis(), 1, "workflow");
        return lineage.getRelations().size();
    }, 10, TimeUnit.SECONDS);
    Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(readDataset), 0, System.currentTimeMillis(), 1, "workflow");
    Set<Relation> expectedLineage = ImmutableSet.of(new Relation(NamespaceId.DEFAULT.dataset(readDataset), programId, AccessType.READ, runId), new Relation(NamespaceId.DEFAULT.dataset(writeDataset), programId, AccessType.WRITE, runId));
    Assert.assertEquals(expectedLineage, lineage.getRelations());
    // get dataset lineage for dest dataset, in this test they should be same
    lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(writeDataset), 0, System.currentTimeMillis(), 1, "workflow");
    Assert.assertEquals(2, lineage.getRelations().size());
    Assert.assertEquals(expectedLineage, lineage.getRelations());
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) FieldRelation(io.cdap.cdap.metadata.FieldRelation) WorkflowManager(io.cdap.cdap.test.WorkflowManager) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) MetadataOperation(io.cdap.cdap.data2.metadata.writer.MetadataOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) LineageAdmin(io.cdap.cdap.metadata.LineageAdmin) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) Relation(io.cdap.cdap.data2.metadata.lineage.Relation) FieldRelation(io.cdap.cdap.metadata.FieldRelation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) RunId(org.apache.twill.api.RunId) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) Lineage(io.cdap.cdap.data2.metadata.lineage.Lineage) ProgramId(io.cdap.cdap.proto.id.ProgramId) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) DatasetFieldLineageSummary(io.cdap.cdap.metadata.DatasetFieldLineageSummary) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin)

Example 2 with FieldLineageAdmin

use of io.cdap.cdap.metadata.FieldLineageAdmin in project cdap by caskdata.

the class TestBase method initialize.

@BeforeClass
public static void initialize() throws Exception {
    if (nestedStartCount++ > 0) {
        return;
    }
    File localDataDir = TMP_FOLDER.newFolder();
    cConf = createCConf(localDataDir);
    CConfiguration previewCConf = createPreviewConf(cConf);
    LevelDBTableService previewLevelDBTableService = new LevelDBTableService();
    previewLevelDBTableService.setConfiguration(previewCConf);
    // enable default services
    File capabilityFolder = new File(localDataDir.toString(), "capability");
    capabilityFolder.mkdir();
    cConf.set(Constants.Capability.CONFIG_DIR, capabilityFolder.getAbsolutePath());
    cConf.setInt(Constants.Capability.AUTO_INSTALL_THREADS, 5);
    org.apache.hadoop.conf.Configuration hConf = new org.apache.hadoop.conf.Configuration();
    hConf.addResource("mapred-site-local.xml");
    hConf.reloadConfiguration();
    hConf.set(Constants.CFG_LOCAL_DATA_DIR, localDataDir.getAbsolutePath());
    hConf.set(Constants.AppFabric.OUTPUT_DIR, cConf.get(Constants.AppFabric.OUTPUT_DIR));
    hConf.set("hadoop.tmp.dir", new File(localDataDir, cConf.get(Constants.AppFabric.TEMP_DIR)).getAbsolutePath());
    // Windows specific requirements
    if (OSDetector.isWindows()) {
        File tmpDir = TMP_FOLDER.newFolder();
        File binDir = new File(tmpDir, "bin");
        Assert.assertTrue(binDir.mkdirs());
        copyTempFile("hadoop.dll", tmpDir);
        copyTempFile("winutils.exe", binDir);
        System.setProperty("hadoop.home.dir", tmpDir.getAbsolutePath());
        System.load(new File(tmpDir, "hadoop.dll").getAbsolutePath());
    }
    injector = Guice.createInjector(createDataFabricModule(), new TransactionExecutorModule(), new DataSetsModules().getStandaloneModules(), new DataSetServiceModules().getInMemoryModules(), new ConfigModule(cConf, hConf), RemoteAuthenticatorModules.getNoOpModule(), new IOModule(), new LocalLocationModule(), new InMemoryDiscoveryModule(), new AppFabricServiceRuntimeModule(cConf).getInMemoryModules(), new MonitorHandlerModule(false), new AuthenticationContextModules().getMasterModule(), new AuthorizationModule(), new AuthorizationEnforcementModule().getInMemoryModules(), new ProgramRunnerRuntimeModule().getInMemoryModules(), new SecureStoreServerModule(), new MetadataReaderWriterModules().getInMemoryModules(), new MetadataServiceModule(), new AbstractModule() {

        @Override
        protected void configure() {
            bind(MetricsManager.class).toProvider(MetricsManagerProvider.class);
        }
    }, new MetricsClientRuntimeModule().getInMemoryModules(), new LocalLogAppenderModule(), new LogReaderRuntimeModules().getInMemoryModules(), new ExploreRuntimeModule().getInMemoryModules(), new ExploreClientModule(), new MessagingServerRuntimeModule().getInMemoryModules(), new PreviewConfigModule(cConf, new Configuration(), SConfiguration.create()), new PreviewManagerModule(false), new PreviewRunnerManagerModule().getInMemoryModules(), new SupportBundleServiceModule(), new MockProvisionerModule(), new AbstractModule() {

        @Override
        protected void configure() {
            install(new FactoryModuleBuilder().implement(ApplicationManager.class, DefaultApplicationManager.class).build(ApplicationManagerFactory.class));
            install(new FactoryModuleBuilder().implement(ArtifactManager.class, DefaultArtifactManager.class).build(ArtifactManagerFactory.class));
            bind(TemporaryFolder.class).toInstance(TMP_FOLDER);
            bind(AuthorizationHandler.class).in(Scopes.SINGLETON);
            // Needed by MonitorHandlerModuler
            bind(TwillRunner.class).to(NoopTwillRunnerService.class);
            bind(MetadataSubscriberService.class).in(Scopes.SINGLETON);
        }
    });
    messagingService = injector.getInstance(MessagingService.class);
    if (messagingService instanceof Service) {
        ((Service) messagingService).startAndWait();
    }
    txService = injector.getInstance(TransactionManager.class);
    txService.startAndWait();
    metadataSubscriberService = injector.getInstance(MetadataSubscriberService.class);
    metadataStorage = injector.getInstance(MetadataStorage.class);
    metadataAdmin = injector.getInstance(MetadataAdmin.class);
    metadataStorage.createIndex();
    metadataService = injector.getInstance(MetadataService.class);
    metadataService.startAndWait();
    // Define all StructuredTable before starting any services that need StructuredTable
    StoreDefinition.createAllTables(injector.getInstance(StructuredTableAdmin.class));
    dsOpService = injector.getInstance(DatasetOpExecutorService.class);
    dsOpService.startAndWait();
    datasetService = injector.getInstance(DatasetService.class);
    datasetService.startAndWait();
    metricsCollectionService = injector.getInstance(MetricsCollectionService.class);
    metricsCollectionService.startAndWait();
    if (cConf.getBoolean(Constants.Explore.EXPLORE_ENABLED)) {
        exploreExecutorService = injector.getInstance(ExploreExecutorService.class);
        exploreExecutorService.startAndWait();
        // wait for explore service to be discoverable
        DiscoveryServiceClient discoveryService = injector.getInstance(DiscoveryServiceClient.class);
        EndpointStrategy endpointStrategy = new RandomEndpointStrategy(() -> discoveryService.discover(Constants.Service.EXPLORE_HTTP_USER_SERVICE));
        Preconditions.checkNotNull(endpointStrategy.pick(5, TimeUnit.SECONDS), "%s service is not up after 5 seconds", Constants.Service.EXPLORE_HTTP_USER_SERVICE);
        exploreClient = injector.getInstance(ExploreClient.class);
    }
    programScheduler = injector.getInstance(Scheduler.class);
    if (programScheduler instanceof Service) {
        ((Service) programScheduler).startAndWait();
    }
    testManager = injector.getInstance(UnitTestManager.class);
    metricsManager = injector.getInstance(MetricsManager.class);
    accessControllerInstantiator = injector.getInstance(AccessControllerInstantiator.class);
    // This is needed so the logged-in user can successfully create the default namespace
    if (cConf.getBoolean(Constants.Security.Authorization.ENABLED)) {
        String user = System.getProperty("user.name");
        SecurityRequestContext.setUserId(user);
        InstanceId instance = new InstanceId(cConf.get(Constants.INSTANCE_NAME));
        Principal principal = new Principal(user, Principal.PrincipalType.USER);
        accessControllerInstantiator.get().grant(Authorizable.fromEntityId(instance), principal, EnumSet.allOf(StandardPermission.class));
        accessControllerInstantiator.get().grant(Authorizable.fromEntityId(NamespaceId.DEFAULT), principal, EnumSet.allOf(StandardPermission.class));
    }
    namespaceAdmin = injector.getInstance(NamespaceAdmin.class);
    if (firstInit) {
        // only create the default namespace on first test. if multiple tests are run in the same JVM,
        // then any time after the first time, the default namespace already exists. That is because
        // the namespaceAdmin.delete(Id.Namespace.DEFAULT) in finish() only clears the default namespace
        // but does not remove it entirely
        namespaceAdmin.create(NamespaceMeta.DEFAULT);
        ProfileService profileService = injector.getInstance(ProfileService.class);
        profileService.saveProfile(ProfileId.NATIVE, Profile.NATIVE);
    }
    secureStore = injector.getInstance(SecureStore.class);
    secureStoreManager = injector.getInstance(SecureStoreManager.class);
    messagingContext = new MultiThreadMessagingContext(messagingService);
    firstInit = false;
    previewHttpServer = injector.getInstance(PreviewHttpServer.class);
    previewHttpServer.startAndWait();
    fieldLineageAdmin = injector.getInstance(FieldLineageAdmin.class);
    lineageAdmin = injector.getInstance(LineageAdmin.class);
    metadataSubscriberService.startAndWait();
    previewRunnerManager = injector.getInstance(PreviewRunnerManager.class);
    if (previewRunnerManager instanceof Service) {
        ((Service) previewRunnerManager).startAndWait();
    }
    appFabricServer = injector.getInstance(AppFabricServer.class);
    appFabricServer.startAndWait();
    preferencesService = injector.getInstance(PreferencesService.class);
    scheduler = injector.getInstance(Scheduler.class);
    if (scheduler instanceof Service) {
        ((Service) scheduler).startAndWait();
    }
    if (scheduler instanceof CoreSchedulerService) {
        ((CoreSchedulerService) scheduler).waitUntilFunctional(10, TimeUnit.SECONDS);
    }
    supportBundleInternalService = injector.getInstance(SupportBundleInternalService.class);
    supportBundleInternalService.startAndWait();
    appFabricHealthCheckService = injector.getInstance(HealthCheckService.class);
    appFabricHealthCheckService.helper(Constants.AppFabricHealthCheck.APP_FABRIC_HEALTH_CHECK_SERVICE, cConf, Constants.Service.MASTER_SERVICES_BIND_ADDRESS);
    appFabricHealthCheckService.startAndWait();
}
Also used : DataSetServiceModules(io.cdap.cdap.data.runtime.DataSetServiceModules) PreviewManagerModule(io.cdap.cdap.app.preview.PreviewManagerModule) DiscoveryServiceClient(org.apache.twill.discovery.DiscoveryServiceClient) Configuration(org.apache.hadoop.conf.Configuration) SConfiguration(io.cdap.cdap.common.conf.SConfiguration) CConfiguration(io.cdap.cdap.common.conf.CConfiguration) TwillRunner(org.apache.twill.api.TwillRunner) DatasetService(io.cdap.cdap.data2.datafabric.dataset.service.DatasetService) MetricsClientRuntimeModule(io.cdap.cdap.metrics.guice.MetricsClientRuntimeModule) LineageAdmin(io.cdap.cdap.metadata.LineageAdmin) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin) StandardPermission(io.cdap.cdap.proto.security.StandardPermission) PreviewConfigModule(io.cdap.cdap.app.preview.PreviewConfigModule) RandomEndpointStrategy(io.cdap.cdap.common.discovery.RandomEndpointStrategy) EndpointStrategy(io.cdap.cdap.common.discovery.EndpointStrategy) CoreSchedulerService(io.cdap.cdap.scheduler.CoreSchedulerService) Configuration(org.apache.hadoop.conf.Configuration) SecureStoreManager(io.cdap.cdap.api.security.store.SecureStoreManager) SupportBundleServiceModule(io.cdap.cdap.support.app.guice.SupportBundleServiceModule) PreviewRunnerManagerModule(io.cdap.cdap.app.preview.PreviewRunnerManagerModule) PreviewRunnerManager(io.cdap.cdap.app.preview.PreviewRunnerManager) AuthorizationModule(io.cdap.cdap.app.guice.AuthorizationModule) HealthCheckService(io.cdap.cdap.common.service.HealthCheckService) MetricsCollectionService(io.cdap.cdap.api.metrics.MetricsCollectionService) InstanceId(io.cdap.cdap.proto.id.InstanceId) AuthenticationContextModules(io.cdap.cdap.security.auth.context.AuthenticationContextModules) MetadataServiceModule(io.cdap.cdap.metadata.MetadataServiceModule) NamespaceAdmin(io.cdap.cdap.common.namespace.NamespaceAdmin) ExploreRuntimeModule(io.cdap.cdap.explore.guice.ExploreRuntimeModule) CConfiguration(io.cdap.cdap.common.conf.CConfiguration) SecureStore(io.cdap.cdap.api.security.store.SecureStore) MetadataReaderWriterModules(io.cdap.cdap.metadata.MetadataReaderWriterModules) SupportBundleInternalService(io.cdap.cdap.support.internal.app.services.SupportBundleInternalService) ExploreClientModule(io.cdap.cdap.explore.guice.ExploreClientModule) MetadataSubscriberService(io.cdap.cdap.metadata.MetadataSubscriberService) TransactionManager(org.apache.tephra.TransactionManager) MetadataStorage(io.cdap.cdap.spi.metadata.MetadataStorage) File(java.io.File) AppFabricServiceRuntimeModule(io.cdap.cdap.app.guice.AppFabricServiceRuntimeModule) AuthorizationEnforcementModule(io.cdap.cdap.security.authorization.AuthorizationEnforcementModule) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin) IOModule(io.cdap.cdap.common.guice.IOModule) AuthorizationHandler(io.cdap.cdap.gateway.handlers.AuthorizationHandler) ExploreClient(io.cdap.cdap.explore.client.ExploreClient) InMemoryDiscoveryModule(io.cdap.cdap.common.guice.InMemoryDiscoveryModule) ConfigModule(io.cdap.cdap.common.guice.ConfigModule) PreviewConfigModule(io.cdap.cdap.app.preview.PreviewConfigModule) FactoryModuleBuilder(com.google.inject.assistedinject.FactoryModuleBuilder) ArtifactManagerFactory(io.cdap.cdap.test.internal.ArtifactManagerFactory) StructuredTableAdmin(io.cdap.cdap.spi.data.StructuredTableAdmin) Scheduler(io.cdap.cdap.scheduler.Scheduler) AppFabricServer(io.cdap.cdap.internal.app.services.AppFabricServer) MessagingServerRuntimeModule(io.cdap.cdap.messaging.guice.MessagingServerRuntimeModule) MultiThreadMessagingContext(io.cdap.cdap.messaging.context.MultiThreadMessagingContext) MetadataService(io.cdap.cdap.metadata.MetadataService) MonitorHandlerModule(io.cdap.cdap.app.guice.MonitorHandlerModule) PreferencesService(io.cdap.cdap.config.PreferencesService) TransactionExecutorModule(io.cdap.cdap.data.runtime.TransactionExecutorModule) LocalLocationModule(io.cdap.cdap.common.guice.LocalLocationModule) TemporaryFolder(org.junit.rules.TemporaryFolder) MockProvisionerModule(io.cdap.cdap.internal.provision.MockProvisionerModule) MetadataAdmin(io.cdap.cdap.metadata.MetadataAdmin) ApplicationManagerFactory(io.cdap.cdap.test.internal.ApplicationManagerFactory) LogReaderRuntimeModules(io.cdap.cdap.logging.guice.LogReaderRuntimeModules) DataSetsModules(io.cdap.cdap.data.runtime.DataSetsModules) NoopTwillRunnerService(io.cdap.cdap.common.twill.NoopTwillRunnerService) PreferencesService(io.cdap.cdap.config.PreferencesService) MetadataSubscriberService(io.cdap.cdap.metadata.MetadataSubscriberService) ExploreExecutorService(io.cdap.cdap.explore.executor.ExploreExecutorService) LevelDBTableService(io.cdap.cdap.data2.dataset2.lib.table.leveldb.LevelDBTableService) DatasetOpExecutorService(io.cdap.cdap.data2.datafabric.dataset.service.executor.DatasetOpExecutorService) ProfileService(io.cdap.cdap.internal.profile.ProfileService) Service(com.google.common.util.concurrent.Service) CapabilityManagementService(io.cdap.cdap.internal.capability.CapabilityManagementService) MetadataService(io.cdap.cdap.metadata.MetadataService) HealthCheckService(io.cdap.cdap.common.service.HealthCheckService) MessagingService(io.cdap.cdap.messaging.MessagingService) SupportBundleInternalService(io.cdap.cdap.support.internal.app.services.SupportBundleInternalService) DatasetService(io.cdap.cdap.data2.datafabric.dataset.service.DatasetService) CoreSchedulerService(io.cdap.cdap.scheduler.CoreSchedulerService) MetricsCollectionService(io.cdap.cdap.api.metrics.MetricsCollectionService) AccessControllerInstantiator(io.cdap.cdap.security.authorization.AccessControllerInstantiator) AbstractModule(com.google.inject.AbstractModule) MessagingService(io.cdap.cdap.messaging.MessagingService) ProgramRunnerRuntimeModule(io.cdap.cdap.app.guice.ProgramRunnerRuntimeModule) ProfileService(io.cdap.cdap.internal.profile.ProfileService) LocalLogAppenderModule(io.cdap.cdap.logging.guice.LocalLogAppenderModule) LevelDBTableService(io.cdap.cdap.data2.dataset2.lib.table.leveldb.LevelDBTableService) DatasetOpExecutorService(io.cdap.cdap.data2.datafabric.dataset.service.executor.DatasetOpExecutorService) ExploreExecutorService(io.cdap.cdap.explore.executor.ExploreExecutorService) SecureStoreServerModule(io.cdap.cdap.security.guice.SecureStoreServerModule) Principal(io.cdap.cdap.proto.security.Principal) RandomEndpointStrategy(io.cdap.cdap.common.discovery.RandomEndpointStrategy) PreviewHttpServer(io.cdap.cdap.app.preview.PreviewHttpServer) BeforeClass(org.junit.BeforeClass)

Example 3 with FieldLineageAdmin

use of io.cdap.cdap.metadata.FieldLineageAdmin in project cdap by caskdata.

the class DataStreamsTest method testLineageWithMacros.

@Test
public void testLineageWithMacros() throws Exception {
    Schema schema = Schema.recordOf("test", Schema.Field.of("key", Schema.of(Schema.Type.STRING)), Schema.Field.of("value", Schema.of(Schema.Type.STRING)));
    List<StructuredRecord> input = ImmutableList.of(StructuredRecord.builder(schema).set("key", "key1").set("value", "value1").build(), StructuredRecord.builder(schema).set("key", "key2").set("value", "value2").build());
    String srcName = "lineageSource";
    String sinkName1 = "lineageOutput1";
    String sinkName2 = "lineageOutput2";
    DataStreamsConfig etlConfig = DataStreamsConfig.builder().addStage(new ETLStage("source", MockSource.getPlugin(schema, input, 0L, srcName))).addStage(new ETLStage("sink", MockSink.getPlugin("${output}"))).addStage(new ETLStage("identity", IdentityTransform.getPlugin())).addConnection("source", "identity").addConnection("identity", "sink").setCheckpointDir(checkpointDir).setBatchInterval("1s").build();
    ApplicationId appId = NamespaceId.DEFAULT.app("lineageApp");
    AppRequest<DataStreamsConfig> appRequest = new AppRequest<>(APP_ARTIFACT, etlConfig);
    ApplicationManager appManager = deployApplication(appId, appRequest);
    ProgramId spark = appId.spark(DataStreamsSparkLauncher.NAME);
    RunId runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName1);
    FieldLineageAdmin fieldAdmin = getFieldLineageAdmin();
    LineageAdmin lineageAdmin = getLineageAdmin();
    // wait for the lineage get populated
    Tasks.waitFor(true, () -> {
        Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
        DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
        return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
    }, 10, TimeUnit.SECONDS);
    Lineage lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), 0, System.currentTimeMillis(), 1, "workflow");
    Set<Relation> expectedLineage = ImmutableSet.of(new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId), new Relation(NamespaceId.DEFAULT.dataset(sinkName1), spark, AccessType.WRITE, runId));
    Assert.assertEquals(expectedLineage, lineage.getRelations());
    DatasetFieldLineageSummary summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), 0, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
    Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
    Assert.assertTrue(summary.getIncoming().isEmpty());
    Set<DatasetFieldLineageSummary.FieldLineageRelations> outgoing = summary.getOutgoing();
    Assert.assertEquals(1, outgoing.size());
    Set<DatasetFieldLineageSummary.FieldLineageRelations> expectedRelations = Collections.singleton(new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(sinkName1), 2, ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
    Assert.assertEquals(expectedRelations, outgoing);
    // here sleep for 1 seconds to start the second run because the dataset lineage is storing based on unit second
    TimeUnit.SECONDS.sleep(1);
    long startTimeMillis = System.currentTimeMillis();
    runId = testLineageWithMacro(appManager, new HashSet<>(input), sinkName2);
    // wait for the lineage get populated
    Tasks.waitFor(true, () -> {
        Lineage dsLineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), startTimeMillis, System.currentTimeMillis(), 1, "workflow");
        long end = System.currentTimeMillis();
        DatasetFieldLineageSummary fll = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), startTimeMillis, end);
        return dsLineage.getRelations().size() == 2 && !fll.getOutgoing().isEmpty();
    }, 10, TimeUnit.SECONDS);
    lineage = lineageAdmin.computeLineage(NamespaceId.DEFAULT.dataset(srcName), startTimeMillis, System.currentTimeMillis(), 1, "workflow");
    expectedLineage = ImmutableSet.of(new Relation(NamespaceId.DEFAULT.dataset(srcName), spark, AccessType.READ, runId), new Relation(NamespaceId.DEFAULT.dataset(sinkName2), spark, AccessType.WRITE, runId));
    Assert.assertEquals(expectedLineage, lineage.getRelations());
    summary = fieldAdmin.getDatasetFieldLineage(Constants.FieldLineage.Direction.BOTH, EndPoint.of("default", srcName), startTimeMillis, System.currentTimeMillis());
    Assert.assertEquals(NamespaceId.DEFAULT.dataset(srcName), summary.getDatasetId());
    Assert.assertEquals(ImmutableSet.of("key", "value"), summary.getFields());
    Assert.assertTrue(summary.getIncoming().isEmpty());
    outgoing = summary.getOutgoing();
    Assert.assertEquals(1, outgoing.size());
    expectedRelations = Collections.singleton(new DatasetFieldLineageSummary.FieldLineageRelations(NamespaceId.DEFAULT.dataset(sinkName2), 2, ImmutableSet.of(new FieldRelation("key", "key"), new FieldRelation("value", "value"))));
    Assert.assertEquals(expectedRelations, outgoing);
}
Also used : ApplicationManager(io.cdap.cdap.test.ApplicationManager) FieldRelation(io.cdap.cdap.metadata.FieldRelation) Schema(io.cdap.cdap.api.data.schema.Schema) LineageAdmin(io.cdap.cdap.metadata.LineageAdmin) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) Relation(io.cdap.cdap.data2.metadata.lineage.Relation) FieldRelation(io.cdap.cdap.metadata.FieldRelation) RunId(org.apache.twill.api.RunId) HashSet(java.util.HashSet) Lineage(io.cdap.cdap.data2.metadata.lineage.Lineage) ProgramId(io.cdap.cdap.proto.id.ProgramId) DataStreamsConfig(io.cdap.cdap.etl.proto.v2.DataStreamsConfig) AppRequest(io.cdap.cdap.proto.artifact.AppRequest) DatasetFieldLineageSummary(io.cdap.cdap.metadata.DatasetFieldLineageSummary) ETLStage(io.cdap.cdap.etl.proto.v2.ETLStage) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) FieldLineageAdmin(io.cdap.cdap.metadata.FieldLineageAdmin) Test(org.junit.Test)

Aggregations

FieldLineageAdmin (io.cdap.cdap.metadata.FieldLineageAdmin)3 LineageAdmin (io.cdap.cdap.metadata.LineageAdmin)3 Lineage (io.cdap.cdap.data2.metadata.lineage.Lineage)2 Relation (io.cdap.cdap.data2.metadata.lineage.Relation)2 Service (com.google.common.util.concurrent.Service)1 AbstractModule (com.google.inject.AbstractModule)1 FactoryModuleBuilder (com.google.inject.assistedinject.FactoryModuleBuilder)1 StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord)1 Schema (io.cdap.cdap.api.data.schema.Schema)1 Operation (io.cdap.cdap.api.lineage.field.Operation)1 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)1 TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)1 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)1 MetricsCollectionService (io.cdap.cdap.api.metrics.MetricsCollectionService)1 SecureStore (io.cdap.cdap.api.security.store.SecureStore)1 SecureStoreManager (io.cdap.cdap.api.security.store.SecureStoreManager)1 AppFabricServiceRuntimeModule (io.cdap.cdap.app.guice.AppFabricServiceRuntimeModule)1 AuthorizationModule (io.cdap.cdap.app.guice.AuthorizationModule)1 MonitorHandlerModule (io.cdap.cdap.app.guice.MonitorHandlerModule)1 ProgramRunnerRuntimeModule (io.cdap.cdap.app.guice.ProgramRunnerRuntimeModule)1