
Example 16 with ProgramRunId

use of io.cdap.cdap.proto.id.ProgramRunId in project cdap by caskdata.

the class LineageAdmin method doComputeLineage.

private Lineage doComputeLineage(DatasetId sourceData, long startMillis, long endMillis, int levels, @Nullable String rollup) {
    LOG.trace("Computing lineage for data {}, startMillis {}, endMillis {}, levels {}", sourceData, startMillis, endMillis, levels);
    boolean rollUpWorkflow = rollup != null && rollup.contains("workflow");
    // Convert start time and end time period into scan keys in terms of program start times.
    Set<RunId> runningInRange = store.getRunningInRange(TimeUnit.MILLISECONDS.toSeconds(startMillis), TimeUnit.MILLISECONDS.toSeconds(endMillis));
    LOG.trace("Got {} rundIds in time range ({}, {})", runningInRange.size(), startMillis, endMillis);
    ScanRangeWithFilter scanRange = getScanRange(runningInRange);
    LOG.trace("Using scan start = {}, scan end = {}", scanRange.getStart(), scanRange.getEnd());
    Multimap<RelationKey, Relation> relations = HashMultimap.create();
    Set<DatasetId> visitedDatasets = new HashSet<>();
    Set<DatasetId> toVisitDatasets = new HashSet<>();
    Set<ProgramId> visitedPrograms = new HashSet<>();
    Set<ProgramId> toVisitPrograms = new HashSet<>();
    // This map maps an inner program run id to its workflow run id; it is needed to collapse the inner
    // programs and local datasets into their workflow
    Map<ProgramRunId, ProgramRunId> programWorkflowMap = new HashMap<>();
    toVisitDatasets.add(sourceData);
    for (int i = 0; i < levels; ++i) {
        LOG.trace("Level {}", i);
        toVisitPrograms.clear();
        for (DatasetId d : toVisitDatasets) {
            if (visitedDatasets.add(d)) {
                LOG.trace("Visiting dataset {}", d);
                // Fetch related programs; these are the inner programs that access the dataset, for example
                // a MapReduce or Spark program in a workflow
                Set<Relation> programRelations = lineageStoreReader.getRelations(d, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter());
                LOG.trace("Got program relations {}", programRelations);
                // Determine whether a dataset is a local dataset; local dataset names always end with the workflow run id
                if (rollUpWorkflow) {
                    computeWorkflowInnerPrograms(toVisitPrograms, programWorkflowMap, programRelations);
                }
                // Add to the relations, replacing the inner program with its workflow using the map, and ignore the
                // local dataset relations; local dataset names always end with the run id of the workflow
                filterAndAddRelations(rollUpWorkflow, relations, programWorkflowMap, programRelations);
                toVisitPrograms.addAll(programRelations.stream().map(Relation::getProgram).collect(Collectors.toSet()));
            }
        }
        toVisitDatasets.clear();
        for (ProgramId p : toVisitPrograms) {
            if (visitedPrograms.add(p)) {
                LOG.trace("Visiting program {}", p);
                // Fetch related datasets
                Set<Relation> datasetRelations = lineageStoreReader.getRelations(p, scanRange.getStart(), scanRange.getEnd(), scanRange.getFilter());
                LOG.trace("Got data relations {}", datasetRelations);
                Set<DatasetId> localDatasets = filterAndAddRelations(rollUpWorkflow, relations, programWorkflowMap, datasetRelations);
                toVisitDatasets.addAll(datasetRelations.stream().map(relation -> (DatasetId) relation.getData()).filter(datasetId -> !localDatasets.contains(datasetId)).collect(Collectors.toSet()));
            }
        }
    }
    Lineage lineage = new Lineage(Iterables.concat(Maps.transformValues(relations.asMap(), COLLAPSE_UNKNOWN_TYPE_FUNCTION::apply).values()));
    LOG.trace("Got lineage {}", lineage);
    return lineage;
}
Also used : DefaultLineageStoreReader(io.cdap.cdap.data2.metadata.lineage.DefaultLineageStoreReader) RunRecordDetail(io.cdap.cdap.internal.app.store.RunRecordDetail) Iterables(com.google.common.collect.Iterables) WorkflowId(io.cdap.cdap.proto.id.WorkflowId) WorkflowSpecification(io.cdap.cdap.api.workflow.WorkflowSpecification) Inject(com.google.inject.Inject) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) Collections2(com.google.common.collect.Collections2) Multimap(com.google.common.collect.Multimap) ProgramType(io.cdap.cdap.proto.ProgramType) Function(java.util.function.Function) Relation(io.cdap.cdap.data2.metadata.lineage.Relation) HashSet(java.util.HashSet) WorkflowNode(io.cdap.cdap.api.workflow.WorkflowNode) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) HashMultimap(com.google.common.collect.HashMultimap) DatasetId(io.cdap.cdap.proto.id.DatasetId) Map(java.util.Map) RunId(org.apache.twill.api.RunId) WorkflowActionNode(io.cdap.cdap.api.workflow.WorkflowActionNode) AccessType(io.cdap.cdap.data2.metadata.lineage.AccessType) Nullable(javax.annotation.Nullable) Logger(org.slf4j.Logger) RunIds(io.cdap.cdap.common.app.RunIds) Lineage(io.cdap.cdap.data2.metadata.lineage.Lineage) Predicate(java.util.function.Predicate) Collection(java.util.Collection) ApplicationSpecification(io.cdap.cdap.api.app.ApplicationSpecification) ProgramId(io.cdap.cdap.proto.id.ProgramId) Set(java.util.Set) Maps(com.google.common.collect.Maps) Collectors(java.util.stream.Collectors) Store(io.cdap.cdap.app.store.Store) Objects(java.util.Objects) TimeUnit(java.util.concurrent.TimeUnit) VisibleForTesting(com.google.common.annotations.VisibleForTesting) LineageStoreReader(io.cdap.cdap.data2.metadata.lineage.LineageStoreReader) ProgramOptionConstants(io.cdap.cdap.internal.app.runtime.ProgramOptionConstants) ApplicationId(io.cdap.cdap.proto.id.ApplicationId)
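
For context, here is a minimal self-contained sketch of the level-bounded traversal that doComputeLineage performs: datasets and programs are expanded in alternating passes until the requested number of levels is reached. The maps datasetToPrograms and programToDatasets are hypothetical stand-ins for the lineageStoreReader.getRelations lookups, and the workflow rollup and local-dataset filtering are omitted.

import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

final class LineageTraversalSketch {

    static Set<String> reachablePrograms(String sourceDataset, int levels,
                                         Map<String, Set<String>> datasetToPrograms,
                                         Map<String, Set<String>> programToDatasets) {
        Set<String> visitedDatasets = new HashSet<>();
        Set<String> visitedPrograms = new HashSet<>();
        Set<String> toVisitDatasets = new HashSet<>();
        Set<String> toVisitPrograms = new HashSet<>();
        toVisitDatasets.add(sourceDataset);
        for (int i = 0; i < levels; i++) {
            toVisitPrograms.clear();
            for (String d : toVisitDatasets) {
                if (visitedDatasets.add(d)) {
                    // Programs that accessed this dataset form the next frontier.
                    toVisitPrograms.addAll(datasetToPrograms.getOrDefault(d, Collections.emptySet()));
                }
            }
            toVisitDatasets.clear();
            for (String p : toVisitPrograms) {
                if (visitedPrograms.add(p)) {
                    // Datasets touched by this program are expanded at the next level.
                    toVisitDatasets.addAll(programToDatasets.getOrDefault(p, Collections.emptySet()));
                }
            }
        }
        return visitedPrograms;
    }
}

The real method additionally collapses inner programs into their workflow when rollup contains "workflow" and skips local dataset relations, as the comments in the code above describe.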

Example 17 with ProgramRunId

use of io.cdap.cdap.proto.id.ProgramRunId in project cdap by caskdata.

the class SparkTwillRunnableModuleTest method testSpark.

@Test
public void testSpark() {
    ProgramRunId programRunId = NamespaceId.DEFAULT.app("test").spark("spark").run(RunIds.generate());
    for (ClusterMode mode : ClusterMode.values()) {
        Module module = new SparkTwillRunnable("spark") {

            @Override
            protected ServiceAnnouncer getServiceAnnouncer() {
                return new MockTwillContext();
            }
        }.createModule(CConfiguration.create(), new Configuration(), createProgramOptions(programRunId, mode), programRunId);
        Injector injector = Guice.createInjector(module);
        injector.getInstance(SparkProgramRunner.class);
        injector.getInstance(ExploreClient.class);
        Injector contextInjector = SparkRuntimeContextProvider.createInjector(CConfiguration.create(), new Configuration(), programRunId.getParent(), createProgramOptions(programRunId, mode));
        contextInjector.getInstance(PluginFinder.class);
        contextInjector.getInstance(ExploreClient.class);
    }
}
Also used : MockTwillContext(io.cdap.cdap.common.test.MockTwillContext) CConfiguration(io.cdap.cdap.common.conf.CConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ClusterMode(io.cdap.cdap.app.guice.ClusterMode) Injector(com.google.inject.Injector) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) Module(com.google.inject.Module) ServiceAnnouncer(org.apache.twill.api.ServiceAnnouncer) Test(org.junit.Test)
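
This test follows a common Guice pattern: build the module, create an injector, and resolve the key bindings so that a missing or misconfigured binding fails immediately. A minimal self-contained sketch of that pattern, using a hypothetical GreetingService instead of the CDAP classes:

import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import com.google.inject.Injector;

public class BindingSmokeTestSketch {

    interface GreetingService {
        String greet(String name);
    }

    static class DefaultGreetingService implements GreetingService {
        @Override
        public String greet(String name) {
            return "Hello, " + name;
        }
    }

    static class GreetingModule extends AbstractModule {
        @Override
        protected void configure() {
            bind(GreetingService.class).to(DefaultGreetingService.class);
        }
    }

    public static void main(String[] args) {
        Injector injector = Guice.createInjector(new GreetingModule());
        // Resolving the binding is itself the assertion: Guice throws if it cannot be satisfied.
        GreetingService service = injector.getInstance(GreetingService.class);
        System.out.println(service.greet("cdap"));
    }
}

Calling getInstance is the assertion: Guice fails with a ConfigurationException or ProvisionException when a requested binding cannot be satisfied, which is exactly how the tests in these examples verify the CDAP runnable modules.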

Example 18 with ProgramRunId

use of io.cdap.cdap.proto.id.ProgramRunId in project cdap by caskdata.

the class ProgramTwillRunnableModuleTest method testService.

@Test
public void testService() {
    ProgramRunId programRunId = NamespaceId.DEFAULT.app("test").service("service").run(RunIds.generate());
    for (ClusterMode mode : ClusterMode.values()) {
        Module module = new ServiceTwillRunnable("service") {

            @Override
            protected ServiceAnnouncer getServiceAnnouncer() {
                return new MockTwillContext();
            }
        }.createModule(CConfiguration.create(), new Configuration(), createProgramOptions(programRunId, mode), programRunId);
        Injector injector = Guice.createInjector(module);
        injector.getInstance(ServiceProgramRunner.class);
        injector.getInstance(ExploreClient.class);
    }
}
Also used : MockTwillContext(io.cdap.cdap.common.test.MockTwillContext) Configuration(org.apache.hadoop.conf.Configuration) CConfiguration(io.cdap.cdap.common.conf.CConfiguration) ClusterMode(io.cdap.cdap.app.guice.ClusterMode) Injector(com.google.inject.Injector) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) Module(com.google.inject.Module) ServiceAnnouncer(org.apache.twill.api.ServiceAnnouncer) Test(org.junit.Test)

Example 19 with ProgramRunId

use of io.cdap.cdap.proto.id.ProgramRunId in project cdap by caskdata.

the class ProgramTwillRunnableModuleTest method testWorkflow.

@Test
public void testWorkflow() {
    ProgramRunId programRunId = NamespaceId.DEFAULT.app("test").workflow("workflow").run(RunIds.generate());
    for (ClusterMode mode : ClusterMode.values()) {
        Module module = new WorkflowTwillRunnable("workflow").createModule(CConfiguration.create(), new Configuration(), createProgramOptions(programRunId, mode), programRunId);
        Injector injector = Guice.createInjector(module);
        injector.getInstance(WorkflowProgramRunner.class);
        // Workflow supports spark, which supports PluginFinder
        injector.getInstance(PluginFinder.class);
        injector.getInstance(ExploreClient.class);
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) CConfiguration(io.cdap.cdap.common.conf.CConfiguration) ClusterMode(io.cdap.cdap.app.guice.ClusterMode) Injector(com.google.inject.Injector) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) Module(com.google.inject.Module) Test(org.junit.Test)

Example 20 with ProgramRunId

use of io.cdap.cdap.proto.id.ProgramRunId in project cdap by caskdata.

the class ProgramTwillRunnableModuleTest method testMapReduce.

@Test
public void testMapReduce() {
    ProgramRunId programRunId = NamespaceId.DEFAULT.app("test").mr("mapreduce").run(RunIds.generate());
    for (ClusterMode mode : ClusterMode.values()) {
        Module module = new MapReduceTwillRunnable("mapreduce").createModule(CConfiguration.create(), new Configuration(), createProgramOptions(programRunId, mode), programRunId);
        Injector injector = Guice.createInjector(module);
        injector.getInstance(MapReduceProgramRunner.class);
        injector.getInstance(ExploreClient.class);
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) CConfiguration(io.cdap.cdap.common.conf.CConfiguration) ClusterMode(io.cdap.cdap.app.guice.ClusterMode) Injector(com.google.inject.Injector) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) Module(com.google.inject.Module) Test(org.junit.Test)
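
Across Examples 17 through 20, the ProgramRunId is built with the same fluent id chain; only the program-type step changes. The calls below are taken from the tests above and collected into one sketch, assuming RunIds is io.cdap.cdap.common.app.RunIds as in Example 16's imports.

import io.cdap.cdap.common.app.RunIds;
import io.cdap.cdap.proto.id.NamespaceId;
import io.cdap.cdap.proto.id.ProgramId;
import io.cdap.cdap.proto.id.ProgramRunId;

final class ProgramRunIdSketch {

    static void buildRunIds() {
        // Each chain is namespace -> app -> program (typed) -> run, exactly as in the tests above.
        ProgramRunId sparkRun = NamespaceId.DEFAULT.app("test").spark("spark").run(RunIds.generate());
        ProgramRunId serviceRun = NamespaceId.DEFAULT.app("test").service("service").run(RunIds.generate());
        ProgramRunId workflowRun = NamespaceId.DEFAULT.app("test").workflow("workflow").run(RunIds.generate());
        ProgramRunId mapReduceRun = NamespaceId.DEFAULT.app("test").mr("mapreduce").run(RunIds.generate());
        // getParent() strips the run id and yields the ProgramId, as used in Example 17 when
        // creating the Spark runtime context injector.
        ProgramId sparkProgram = sparkRun.getParent();
    }
}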

Aggregations

ProgramRunId (io.cdap.cdap.proto.id.ProgramRunId) 158
Test (org.junit.Test) 85
ProgramId (io.cdap.cdap.proto.id.ProgramId) 62
RunId (org.apache.twill.api.RunId) 38
ApplicationId (io.cdap.cdap.proto.id.ApplicationId) 37
ArrayList (java.util.ArrayList) 29
RunRecordDetail (io.cdap.cdap.internal.app.store.RunRecordDetail) 25
HashMap (java.util.HashMap) 25
ArtifactId (io.cdap.cdap.api.artifact.ArtifactId) 24
CConfiguration (io.cdap.cdap.common.conf.CConfiguration) 24
NamespaceId (io.cdap.cdap.proto.id.NamespaceId) 24
ApplicationSpecification (io.cdap.cdap.api.app.ApplicationSpecification) 23
IOException (java.io.IOException) 22
HashSet (java.util.HashSet) 21
ProgramOptions (io.cdap.cdap.app.runtime.ProgramOptions) 20
ProgramRunStatus (io.cdap.cdap.proto.ProgramRunStatus) 19
Map (java.util.Map) 19
SimpleProgramOptions (io.cdap.cdap.internal.app.runtime.SimpleProgramOptions) 18
Injector (com.google.inject.Injector) 15
TimeUnit (java.util.concurrent.TimeUnit) 15