Search in sources :

Example 21 with OpenLineage

Use of io.openlineage.client.OpenLineage in the OpenLineage project.

From the class OutputStatisticsOutputDatasetFacetBuilderTest, method testBuild.

// Verifies that completing a job emits an "outputStatistics" facet whose
// row count and byte size come from the task metrics registered for that job.
@Test
public void testBuild() {
    OpenLineageContext olContext =
        OpenLineageContext.builder()
            .openLineage(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI))
            .sparkContext(sparkContext)
            .build();
    OutputStatisticsOutputDatasetFacetBuilder facetBuilder =
        new OutputStatisticsOutputDatasetFacetBuilder(olContext);

    // Register stage 1 under job 1, then attach metrics: 10 bytes / 100 records written.
    JobMetricsHolder.getInstance().addJobStages(1, Collections.singleton(1));
    TaskMetrics metrics = new TaskMetrics();
    metrics.outputMetrics().setBytesWritten(10L);
    metrics.outputMetrics().setRecordsWritten(100L);
    JobMetricsHolder.getInstance().addMetrics(1, metrics);

    // Collect the facets produced for a successful job-end event.
    Map<String, OutputDatasetFacet> collected = new HashMap<>();
    facetBuilder.build(new SparkListenerJobEnd(1, 1L, JobSucceeded$.MODULE$), collected::put);

    assertThat(collected)
        .hasEntrySatisfying(
            "outputStatistics",
            facet ->
                assertThat(facet)
                    .hasFieldOrPropertyWithValue("rowCount", 100L)
                    .hasFieldOrPropertyWithValue("size", 10L));
}
Also used : TaskMetrics(org.apache.spark.executor.TaskMetrics) HashMap(java.util.HashMap) SparkListenerJobEnd(org.apache.spark.scheduler.SparkListenerJobEnd) OpenLineage(io.openlineage.client.OpenLineage) OutputDatasetFacet(io.openlineage.client.OpenLineage.OutputDatasetFacet) Test(org.junit.jupiter.api.Test)

Example 22 with OpenLineage

Use of io.openlineage.client.OpenLineage in the OpenLineage project.

From the class SparkVersionFacetBuilderTest, method testIsDefinedForSparkListenerEvents.

// Confirms the builder declares itself applicable to every Spark listener
// event type it is expected to handle (SQL execution, job, and stage events).
@Test
public void testIsDefinedForSparkListenerEvents() {
    OpenLineageContext olContext =
        OpenLineageContext.builder()
            .sparkContext(sparkContext)
            .openLineage(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI))
            .build();
    SparkVersionFacetBuilder versionFacetBuilder = new SparkVersionFacetBuilder(olContext);

    assertThat(versionFacetBuilder.isDefinedAt(new SparkListenerSQLExecutionEnd(1, 1L))).isTrue();
    assertThat(
            versionFacetBuilder.isDefinedAt(
                new SparkListenerSQLExecutionStart(1L, "abc", "abc", "abc", null, 1L)))
        .isTrue();
    assertThat(
            versionFacetBuilder.isDefinedAt(
                new SparkListenerJobStart(1, 1L, Seq$.MODULE$.empty(), new Properties())))
        .isTrue();
    assertThat(versionFacetBuilder.isDefinedAt(new SparkListenerJobEnd(1, 1L, JobSucceeded$.MODULE$)))
        .isTrue();
    assertThat(versionFacetBuilder.isDefinedAt(new SparkListenerStageSubmitted(null, new Properties())))
        .isTrue();
    assertThat(versionFacetBuilder.isDefinedAt(new SparkListenerStageCompleted(null))).isTrue();
}
Also used : SparkListenerSQLExecutionStart(org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart) SparkListenerSQLExecutionEnd(org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd) SparkListenerStageSubmitted(org.apache.spark.scheduler.SparkListenerStageSubmitted) SparkListenerJobStart(org.apache.spark.scheduler.SparkListenerJobStart) SparkListenerJobEnd(org.apache.spark.scheduler.SparkListenerJobEnd) OpenLineage(io.openlineage.client.OpenLineage) Properties(java.util.Properties) SparkListenerStageCompleted(org.apache.spark.scheduler.SparkListenerStageCompleted) Test(org.junit.jupiter.api.Test)

Example 23 with OpenLineage

Use of io.openlineage.client.OpenLineage in the OpenLineage project.

From the class InternalEventHandlerFactoryTest, method setup.

// Initializes the shared SparkContext (local master) and the OpenLineageContext
// once before the whole test class runs.
@BeforeAll
public static void setup() {
    SparkConf conf =
        new SparkConf().setAppName("InternalEventHandlerFactoryTest").setMaster("local");
    sparkContext = SparkContext.getOrCreate(conf);
    context =
        OpenLineageContext.builder()
            .sparkContext(sparkContext)
            .openLineage(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI))
            .build();
}
Also used : OpenLineage(io.openlineage.client.OpenLineage) SparkConf(org.apache.spark.SparkConf) BeforeAll(org.junit.jupiter.api.BeforeAll)

Example 24 with OpenLineage

Use of io.openlineage.client.OpenLineage in the OpenLineage project.

From the class OpenLineageRunEventBuilder, method populateRun.

/**
 * Assembles the final {@link RunEvent} for the lineage run: collects run, job, input and
 * output facets from the registered builders and combines them with the datasets built
 * from the given listener-event nodes.
 *
 * @param parentRunFacet parent-run linkage, attached to the run facets when present
 * @param runEventBuilder partially-populated event builder to finish
 * @param jobBuilder job builder to receive the aggregated job facets
 * @param nodes listener events / plan nodes fed to every facet builder
 * @return the fully built run event
 */
private RunEvent populateRun(Optional<ParentRunFacet> parentRunFacet, RunEventBuilder runEventBuilder, JobBuilder jobBuilder, List<Object> nodes) {
    OpenLineage ol = openLineageContext.getOpenLineage();

    RunFacetsBuilder facetsBuilder = ol.newRunFacetsBuilder();
    parentRunFacet.ifPresent(facetsBuilder::parent);

    OpenLineage.JobFacets jobFacets =
        buildFacets(nodes, jobFacetBuilders, ol.newJobFacetsBuilder().build());
    List<InputDataset> inputs = buildInputDatasets(nodes);
    List<OutputDataset> outputs = buildOutputDatasets(nodes);

    // Record a "spark_unknown" facet when the optimized plan contains nodes
    // that no registered visitor recognized.
    openLineageContext
        .getQueryExecution()
        .flatMap(qe -> unknownEntryFacetListener.build(qe.optimizedPlan()))
        .ifPresent(facet -> facetsBuilder.put("spark_unknown", facet));

    RunFacets runFacets = buildFacets(nodes, runFacetBuilders, facetsBuilder.build());
    OpenLineage.RunBuilder runBuilder =
        ol.newRunBuilder().runId(openLineageContext.getRunUuid()).facets(runFacets);
    return runEventBuilder
        .run(runBuilder.build())
        .job(jobBuilder.facets(jobFacets).build())
        .inputs(inputs)
        .outputs(outputs)
        .build();
}
Also used : OpenLineageClient(io.openlineage.spark.agent.client.OpenLineageClient) Arrays(java.util.Arrays) InputDataset(io.openlineage.client.OpenLineage.InputDataset) RunFacetsBuilder(io.openlineage.client.OpenLineage.RunFacetsBuilder) RunEventBuilder(io.openlineage.client.OpenLineage.RunEventBuilder) DatasetFacets(io.openlineage.client.OpenLineage.DatasetFacets) OutputDataset(io.openlineage.client.OpenLineage.OutputDataset) Map(java.util.Map) JobFacet(io.openlineage.client.OpenLineage.JobFacet) SparkListenerSQLExecutionStart(org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart) JsonAnyGetter(com.fasterxml.jackson.annotation.JsonAnyGetter) TypeReference(com.fasterxml.jackson.core.type.TypeReference) JsonDeserializer(com.fasterxml.jackson.databind.JsonDeserializer) Method(java.lang.reflect.Method) RunEvent(io.openlineage.client.OpenLineage.RunEvent) Stage(org.apache.spark.scheduler.Stage) ScalaConversionUtils.toScalaFn(io.openlineage.spark.agent.util.ScalaConversionUtils.toScalaFn) LogicalPlan(org.apache.spark.sql.catalyst.plans.logical.LogicalPlan) PartialFunction(scala.PartialFunction) NonNull(lombok.NonNull) Collection(java.util.Collection) DatasetFacet(io.openlineage.client.OpenLineage.DatasetFacet) Collectors(java.util.stream.Collectors) IntrospectionException(java.beans.IntrospectionException) OutputDatasetOutputFacets(io.openlineage.client.OpenLineage.OutputDatasetOutputFacets) PlanUtils(io.openlineage.spark.agent.util.PlanUtils) JsonAnySetter(com.fasterxml.jackson.annotation.JsonAnySetter) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) Stream(java.util.stream.Stream) CustomFacetBuilder(io.openlineage.spark.api.CustomFacetBuilder) Type(java.lang.reflect.Type) PropertyDescriptor(java.beans.PropertyDescriptor) Optional(java.util.Optional) RDD(org.apache.spark.rdd.RDD) RunFacet(io.openlineage.client.OpenLineage.RunFacet) JobFailed(org.apache.spark.scheduler.JobFailed) OutputDatasetFacet(io.openlineage.client.OpenLineage.OutputDatasetFacet) 
ParentRunFacet(io.openlineage.client.OpenLineage.ParentRunFacet) Function1(scala.Function1) HashMap(java.util.HashMap) InputDatasetInputFacets(io.openlineage.client.OpenLineage.InputDatasetInputFacets) ScalaConversionUtils.fromSeq(io.openlineage.spark.agent.util.ScalaConversionUtils.fromSeq) ArrayList(java.util.ArrayList) Introspector(java.beans.Introspector) RunFacets(io.openlineage.client.OpenLineage.RunFacets) DeserializationProblemHandler(com.fasterxml.jackson.databind.deser.DeserializationProblemHandler) JobBuilder(io.openlineage.client.OpenLineage.JobBuilder) BeanInfo(java.beans.BeanInfo) SparkListenerJobEnd(org.apache.spark.scheduler.SparkListenerJobEnd) SparkListenerSQLExecutionEnd(org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd) DeserializationContext(com.fasterxml.jackson.databind.DeserializationContext) JsonParser(com.fasterxml.jackson.core.JsonParser) OpenLineageContext(io.openlineage.spark.api.OpenLineageContext) ActiveJob(org.apache.spark.scheduler.ActiveJob) SparkListenerJobStart(org.apache.spark.scheduler.SparkListenerJobStart) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) InputDatasetFacet(io.openlineage.client.OpenLineage.InputDatasetFacet) ScalaConversionUtils(io.openlineage.spark.agent.util.ScalaConversionUtils) SparkListenerStageSubmitted(org.apache.spark.scheduler.SparkListenerStageSubmitted) OpenLineageEventHandlerFactory(io.openlineage.spark.api.OpenLineageEventHandlerFactory) ParameterizedType(java.lang.reflect.ParameterizedType) AllArgsConstructor(lombok.AllArgsConstructor) SparkListenerStageCompleted(org.apache.spark.scheduler.SparkListenerStageCompleted) OpenLineage(io.openlineage.client.OpenLineage) Collections(java.util.Collections) RunFacetsBuilder(io.openlineage.client.OpenLineage.RunFacetsBuilder) InputDataset(io.openlineage.client.OpenLineage.InputDataset) OpenLineage(io.openlineage.client.OpenLineage) OutputDataset(io.openlineage.client.OpenLineage.OutputDataset) 
RunFacets(io.openlineage.client.OpenLineage.RunFacets)

Example 25 with OpenLineage

Use of io.openlineage.client.OpenLineage in the OpenLineage project.

From the class OpenLineageRunEventBuilder, method buildOutputDatasets.

/**
 * Builds the output datasets for the run by combining the registered dataset builders
 * with a visitor pass over the optimized logical plan, then (when any datasets exist)
 * enriching each dataset with the output and dataset facets collected from the nodes.
 *
 * @param nodes listener events / plan nodes fed to the facet builders
 * @return the (possibly empty) list of fully-faceted output datasets
 */
private List<OpenLineage.OutputDataset> buildOutputDatasets(List<Object> nodes) {
    log.info("Visiting query plan {} with output dataset builders {}", openLineageContext.getQueryExecution(), outputDatasetBuilders);

    Function1<LogicalPlan, Collection<OutputDataset>> visitor =
        visitLogicalPlan(PlanUtils.merge(outputDatasetQueryPlanVisitors));
    // Datasets come from two sources: node-level builders and the plan visitor.
    List<OutputDataset> datasets =
        Stream.concat(
                buildDatasets(nodes, outputDatasetBuilders),
                openLineageContext
                    .getQueryExecution()
                    .map(qe -> visitor.apply(qe.optimizedPlan()))
                    .map(Collection::stream)
                    .orElse(Stream.empty()))
            .collect(Collectors.toList());

    OpenLineage ol = openLineageContext.getOpenLineage();
    if (datasets.isEmpty()) {
        return datasets;
    }

    // Gather facets contributed by every builder across all nodes.
    Map<String, OutputDatasetFacet> outputFacets = new HashMap<>();
    nodes.forEach(event -> outputDatasetFacetBuilders.forEach(fn -> fn.accept(event, outputFacets::put)));
    Map<String, DatasetFacet> datasetFacets = new HashMap<>();
    nodes.forEach(event -> datasetFacetBuilders.forEach(fn -> fn.accept(event, datasetFacets::put)));

    // Rebuild each dataset, merging the collected facets with any it already carries.
    return datasets.stream()
        .map(
            ds ->
                ol.newOutputDatasetBuilder()
                    .name(ds.getName())
                    .namespace(ds.getNamespace())
                    .outputFacets(
                        mergeFacets(outputFacets, ds.getOutputFacets(), OutputDatasetOutputFacets.class))
                    .facets(mergeFacets(datasetFacets, ds.getFacets(), DatasetFacets.class))
                    .build())
        .collect(Collectors.toList());
}
Also used : OpenLineageClient(io.openlineage.spark.agent.client.OpenLineageClient) Arrays(java.util.Arrays) InputDataset(io.openlineage.client.OpenLineage.InputDataset) RunFacetsBuilder(io.openlineage.client.OpenLineage.RunFacetsBuilder) RunEventBuilder(io.openlineage.client.OpenLineage.RunEventBuilder) DatasetFacets(io.openlineage.client.OpenLineage.DatasetFacets) OutputDataset(io.openlineage.client.OpenLineage.OutputDataset) Map(java.util.Map) JobFacet(io.openlineage.client.OpenLineage.JobFacet) SparkListenerSQLExecutionStart(org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart) JsonAnyGetter(com.fasterxml.jackson.annotation.JsonAnyGetter) TypeReference(com.fasterxml.jackson.core.type.TypeReference) JsonDeserializer(com.fasterxml.jackson.databind.JsonDeserializer) Method(java.lang.reflect.Method) RunEvent(io.openlineage.client.OpenLineage.RunEvent) Stage(org.apache.spark.scheduler.Stage) ScalaConversionUtils.toScalaFn(io.openlineage.spark.agent.util.ScalaConversionUtils.toScalaFn) LogicalPlan(org.apache.spark.sql.catalyst.plans.logical.LogicalPlan) PartialFunction(scala.PartialFunction) NonNull(lombok.NonNull) Collection(java.util.Collection) DatasetFacet(io.openlineage.client.OpenLineage.DatasetFacet) Collectors(java.util.stream.Collectors) IntrospectionException(java.beans.IntrospectionException) OutputDatasetOutputFacets(io.openlineage.client.OpenLineage.OutputDatasetOutputFacets) PlanUtils(io.openlineage.spark.agent.util.PlanUtils) JsonAnySetter(com.fasterxml.jackson.annotation.JsonAnySetter) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) Stream(java.util.stream.Stream) CustomFacetBuilder(io.openlineage.spark.api.CustomFacetBuilder) Type(java.lang.reflect.Type) PropertyDescriptor(java.beans.PropertyDescriptor) Optional(java.util.Optional) RDD(org.apache.spark.rdd.RDD) RunFacet(io.openlineage.client.OpenLineage.RunFacet) JobFailed(org.apache.spark.scheduler.JobFailed) OutputDatasetFacet(io.openlineage.client.OpenLineage.OutputDatasetFacet) 
ParentRunFacet(io.openlineage.client.OpenLineage.ParentRunFacet) Function1(scala.Function1) HashMap(java.util.HashMap) InputDatasetInputFacets(io.openlineage.client.OpenLineage.InputDatasetInputFacets) ScalaConversionUtils.fromSeq(io.openlineage.spark.agent.util.ScalaConversionUtils.fromSeq) ArrayList(java.util.ArrayList) Introspector(java.beans.Introspector) RunFacets(io.openlineage.client.OpenLineage.RunFacets) DeserializationProblemHandler(com.fasterxml.jackson.databind.deser.DeserializationProblemHandler) JobBuilder(io.openlineage.client.OpenLineage.JobBuilder) BeanInfo(java.beans.BeanInfo) SparkListenerJobEnd(org.apache.spark.scheduler.SparkListenerJobEnd) SparkListenerSQLExecutionEnd(org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd) DeserializationContext(com.fasterxml.jackson.databind.DeserializationContext) JsonParser(com.fasterxml.jackson.core.JsonParser) OpenLineageContext(io.openlineage.spark.api.OpenLineageContext) ActiveJob(org.apache.spark.scheduler.ActiveJob) SparkListenerJobStart(org.apache.spark.scheduler.SparkListenerJobStart) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) InputDatasetFacet(io.openlineage.client.OpenLineage.InputDatasetFacet) ScalaConversionUtils(io.openlineage.spark.agent.util.ScalaConversionUtils) SparkListenerStageSubmitted(org.apache.spark.scheduler.SparkListenerStageSubmitted) OpenLineageEventHandlerFactory(io.openlineage.spark.api.OpenLineageEventHandlerFactory) ParameterizedType(java.lang.reflect.ParameterizedType) AllArgsConstructor(lombok.AllArgsConstructor) SparkListenerStageCompleted(org.apache.spark.scheduler.SparkListenerStageCompleted) OpenLineage(io.openlineage.client.OpenLineage) Collections(java.util.Collections) HashMap(java.util.HashMap) OutputDatasetOutputFacets(io.openlineage.client.OpenLineage.OutputDatasetOutputFacets) OutputDataset(io.openlineage.client.OpenLineage.OutputDataset) OpenLineage(io.openlineage.client.OpenLineage) Collection(java.util.Collection) 
LogicalPlan(org.apache.spark.sql.catalyst.plans.logical.LogicalPlan) DatasetFacet(io.openlineage.client.OpenLineage.DatasetFacet) OutputDatasetFacet(io.openlineage.client.OpenLineage.OutputDatasetFacet) InputDatasetFacet(io.openlineage.client.OpenLineage.InputDatasetFacet) OutputDatasetFacet(io.openlineage.client.OpenLineage.OutputDatasetFacet)

Aggregations

OpenLineage (io.openlineage.client.OpenLineage)38 Test (org.junit.jupiter.api.Test)23 SparkListenerJobEnd (org.apache.spark.scheduler.SparkListenerJobEnd)12 SparkListenerJobStart (org.apache.spark.scheduler.SparkListenerJobStart)9 SparkListenerSQLExecutionEnd (org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd)9 InputDataset (io.openlineage.client.OpenLineage.InputDataset)7 OpenLineageContext (io.openlineage.spark.api.OpenLineageContext)7 LogicalRelation (org.apache.spark.sql.execution.datasources.LogicalRelation)7 SparkListenerSQLExecutionStart (org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart)7 OutputDataset (io.openlineage.client.OpenLineage.OutputDataset)6 HashMap (java.util.HashMap)6 SparkSession (org.apache.spark.sql.SparkSession)6 AttributeReference (org.apache.spark.sql.catalyst.expressions.AttributeReference)6 RunFacet (io.openlineage.client.OpenLineage.RunFacet)5 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)4 RunEvent (io.openlineage.client.OpenLineage.RunEvent)4 SparkListenerStageCompleted (org.apache.spark.scheduler.SparkListenerStageCompleted)4 JsonAnyGetter (com.fasterxml.jackson.annotation.JsonAnyGetter)3 JsonAnySetter (com.fasterxml.jackson.annotation.JsonAnySetter)3 JsonParser (com.fasterxml.jackson.core.JsonParser)3