Search in sources:

Example 1 with OpenLineage

use of io.openlineage.client.OpenLineage in project OpenLineage by OpenLineage.

The class OpenLineageRunEventBuilder, method buildInputDatasets:

private List<OpenLineage.InputDataset> buildInputDatasets(List<Object> nodes) {
    openLineageContext.getQueryExecution().ifPresent(qe -> {
        if (log.isDebugEnabled()) {
            log.debug("Traversing optimized plan {}", qe.optimizedPlan().toJSON());
            log.debug("Physical plan executed {}", qe.executedPlan().toJSON());
        }
    });
    log.info("Visiting query plan {} with input dataset builders {}",
        openLineageContext.getQueryExecution(), inputDatasetBuilders);
    Function1<LogicalPlan, Collection<InputDataset>> inputVisitor =
        visitLogicalPlan(PlanUtils.merge(inputDatasetQueryPlanVisitors));
    List<OpenLineage.InputDataset> datasets = Stream.concat(
            buildDatasets(nodes, inputDatasetBuilders),
            openLineageContext.getQueryExecution()
                .map(qe -> fromSeq(qe.optimizedPlan().map(inputVisitor)).stream()
                    .flatMap(Collection::stream)
                    .map(((Class<InputDataset>) InputDataset.class)::cast))
                .orElse(Stream.empty()))
        .collect(Collectors.toList());
    OpenLineage openLineage = openLineageContext.getOpenLineage();
    if (!datasets.isEmpty()) {
        // Collect input-specific facets and generic dataset facets emitted by the registered custom builders.
        Map<String, InputDatasetFacet> inputFacetsMap = new HashMap<>();
        nodes.forEach(event ->
            inputDatasetFacetBuilders.forEach(fn -> fn.accept(event, inputFacetsMap::put)));
        Map<String, DatasetFacets> datasetFacetsMap = new HashMap<>();
        nodes.forEach(event ->
            datasetFacetBuilders.forEach(fn -> fn.accept(event, datasetFacetsMap::put)));
        return datasets.stream()
            .map(ds -> openLineage.newInputDatasetBuilder()
                .name(ds.getName())
                .namespace(ds.getNamespace())
                .inputFacets(mergeFacets(inputFacetsMap, ds.getInputFacets(), InputDatasetInputFacets.class))
                .facets(mergeFacets(datasetFacetsMap, ds.getFacets(), DatasetFacets.class))
                .build())
            .collect(Collectors.toList());
    }
    return datasets;
}
Also used : OpenLineageClient(io.openlineage.spark.agent.client.OpenLineageClient) Arrays(java.util.Arrays) InputDataset(io.openlineage.client.OpenLineage.InputDataset) RunFacetsBuilder(io.openlineage.client.OpenLineage.RunFacetsBuilder) RunEventBuilder(io.openlineage.client.OpenLineage.RunEventBuilder) DatasetFacets(io.openlineage.client.OpenLineage.DatasetFacets) OutputDataset(io.openlineage.client.OpenLineage.OutputDataset) Map(java.util.Map) JobFacet(io.openlineage.client.OpenLineage.JobFacet) SparkListenerSQLExecutionStart(org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart) JsonAnyGetter(com.fasterxml.jackson.annotation.JsonAnyGetter) TypeReference(com.fasterxml.jackson.core.type.TypeReference) JsonDeserializer(com.fasterxml.jackson.databind.JsonDeserializer) Method(java.lang.reflect.Method) RunEvent(io.openlineage.client.OpenLineage.RunEvent) Stage(org.apache.spark.scheduler.Stage) ScalaConversionUtils.toScalaFn(io.openlineage.spark.agent.util.ScalaConversionUtils.toScalaFn) LogicalPlan(org.apache.spark.sql.catalyst.plans.logical.LogicalPlan) PartialFunction(scala.PartialFunction) NonNull(lombok.NonNull) Collection(java.util.Collection) DatasetFacet(io.openlineage.client.OpenLineage.DatasetFacet) Collectors(java.util.stream.Collectors) IntrospectionException(java.beans.IntrospectionException) OutputDatasetOutputFacets(io.openlineage.client.OpenLineage.OutputDatasetOutputFacets) PlanUtils(io.openlineage.spark.agent.util.PlanUtils) JsonAnySetter(com.fasterxml.jackson.annotation.JsonAnySetter) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) Stream(java.util.stream.Stream) CustomFacetBuilder(io.openlineage.spark.api.CustomFacetBuilder) Type(java.lang.reflect.Type) PropertyDescriptor(java.beans.PropertyDescriptor) Optional(java.util.Optional) RDD(org.apache.spark.rdd.RDD) RunFacet(io.openlineage.client.OpenLineage.RunFacet) JobFailed(org.apache.spark.scheduler.JobFailed) OutputDatasetFacet(io.openlineage.client.OpenLineage.OutputDatasetFacet) ParentRunFacet(io.openlineage.client.OpenLineage.ParentRunFacet) Function1(scala.Function1) HashMap(java.util.HashMap) InputDatasetInputFacets(io.openlineage.client.OpenLineage.InputDatasetInputFacets) ScalaConversionUtils.fromSeq(io.openlineage.spark.agent.util.ScalaConversionUtils.fromSeq) ArrayList(java.util.ArrayList) Introspector(java.beans.Introspector) RunFacets(io.openlineage.client.OpenLineage.RunFacets) DeserializationProblemHandler(com.fasterxml.jackson.databind.deser.DeserializationProblemHandler) JobBuilder(io.openlineage.client.OpenLineage.JobBuilder) BeanInfo(java.beans.BeanInfo) SparkListenerJobEnd(org.apache.spark.scheduler.SparkListenerJobEnd) SparkListenerSQLExecutionEnd(org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd) DeserializationContext(com.fasterxml.jackson.databind.DeserializationContext) JsonParser(com.fasterxml.jackson.core.JsonParser) OpenLineageContext(io.openlineage.spark.api.OpenLineageContext) ActiveJob(org.apache.spark.scheduler.ActiveJob) SparkListenerJobStart(org.apache.spark.scheduler.SparkListenerJobStart) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) InputDatasetFacet(io.openlineage.client.OpenLineage.InputDatasetFacet) ScalaConversionUtils(io.openlineage.spark.agent.util.ScalaConversionUtils) SparkListenerStageSubmitted(org.apache.spark.scheduler.SparkListenerStageSubmitted) OpenLineageEventHandlerFactory(io.openlineage.spark.api.OpenLineageEventHandlerFactory) ParameterizedType(java.lang.reflect.ParameterizedType) AllArgsConstructor(lombok.AllArgsConstructor) 
SparkListenerStageCompleted(org.apache.spark.scheduler.SparkListenerStageCompleted) OpenLineage(io.openlineage.client.OpenLineage) Collections(java.util.Collections)
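
For context, the dataset construction above relies only on the generated builder API of io.openlineage.client.OpenLineage. A minimal standalone sketch of the same calls, with a placeholder producer URI and illustrative dataset coordinates (none of these values come from the agent):

import java.net.URI;

import io.openlineage.client.OpenLineage;

class InputDatasetSketch {

    // Builds a bare InputDataset the same way buildInputDatasets does, minus the facet merging.
    static OpenLineage.InputDataset sampleInput() {
        // Placeholder producer URI; the Spark agent passes OpenLineageClient.OPEN_LINEAGE_CLIENT_URI instead.
        OpenLineage ol = new OpenLineage(URI.create("https://example.com/my-producer"));
        return ol.newInputDatasetBuilder()
            .namespace("file")                            // illustrative namespace
            .name("/tmp/input")                           // illustrative dataset name
            .facets(ol.newDatasetFacetsBuilder().build()) // empty facets container
            .build();
    }
}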

Example 2 with OpenLineage

use of io.openlineage.client.OpenLineage in project OpenLineage by OpenLineage.

The class RddExecutionContext, method start:

@Override
public void start(SparkListenerJobStart jobStart) {
    OpenLineage ol = new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI);
    OpenLineage.RunEvent event = ol.newRunEventBuilder()
        .eventTime(toZonedTime(jobStart.time()))
        .eventType(OpenLineage.RunEvent.EventType.START)
        .inputs(buildInputs(inputs))
        .outputs(buildOutputs(outputs))
        .run(ol.newRunBuilder().runId(runId).facets(buildRunFacets(null)).build())
        .job(buildJob(jobStart.jobId()))
        .build();
    log.debug("Posting event for start {}: {}", jobStart, event);
    sparkContext.emit(event);
}
Also used : OpenLineage(io.openlineage.client.OpenLineage)
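
A job-end counterpart would be built the same way. The following is only a sketch that reuses the helpers visible above (toZonedTime, buildInputs, buildOutputs, buildRunFacets, buildJob) and marks the run COMPLETE; it is not the project's actual end() implementation:

public void end(SparkListenerJobEnd jobEnd) {
    OpenLineage ol = new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI);
    // Mirror of the START event above, closing the run when the Spark job ends.
    OpenLineage.RunEvent event = ol.newRunEventBuilder()
        .eventTime(toZonedTime(jobEnd.time()))
        .eventType(OpenLineage.RunEvent.EventType.COMPLETE)
        .inputs(buildInputs(inputs))
        .outputs(buildOutputs(outputs))
        .run(ol.newRunBuilder().runId(runId).facets(buildRunFacets(null)).build())
        .job(buildJob(jobEnd.jobId()))
        .build();
    log.debug("Posting event for end {}: {}", jobEnd, event);
    sparkContext.emit(event);
}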

Example 3 with OpenLineage

use of io.openlineage.client.OpenLineage in project OpenLineage by OpenLineage.

The class SparkReadWriteIntegTest, method testBigQueryReadWriteToFile:

@Test
public void testBigQueryReadWriteToFile(@TempDir Path writeDir, SparkSession spark) throws InterruptedException, TimeoutException {
    TableId tableId = TableId.of("testproject", "dataset", "MyTable");
    BigQuery bq = MockBigQueryRelationProvider.BIG_QUERY;
    StructType tableSchema = new StructType(new StructField[] {
        new StructField("name", StringType$.MODULE$, false, Metadata.empty()),
        new StructField("age", LongType$.MODULE$, false, Metadata.empty())
    });
    MockBigQueryRelationProvider.INJECTOR.setTestModule(new Module() {

        @Override
        public void configure(Binder binder) {
        }

        @Provides
        public Dataset<Row> testData() {
            return spark.createDataFrame(Arrays.asList(
                new GenericRowWithSchema(new Object[] { "john", 25L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "sam", 22L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "alicia", 35L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "bob", 47L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "jordan", 52L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "liz", 19L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "marcia", 83L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "maria", 40L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "luis", 8L }, tableSchema),
                new GenericRowWithSchema(new Object[] { "gabriel", 30L }, tableSchema)), tableSchema);
        }
    });
    when(bq.getTable(eq(tableId))).thenAnswer(invocation ->
        MockBigQueryRelationProvider.makeTable(tableId,
            StandardTableDefinition.newBuilder()
                .setSchema(Schema.of(
                    Field.of("name", StandardSQLTypeName.STRING),
                    Field.of("age", StandardSQLTypeName.INT64)))
                .setNumBytes(100L)
                .setNumRows(1000L)
                .build()));
    Dataset<Row> df = spark.read()
        .format(MockBigQueryRelationProvider.class.getName())
        .option("gcpAccessToken", "not a real access token")
        .option("parentProject", "not a project")
        .load("testproject.dataset.MyTable");
    String outputDir = writeDir.resolve("testBigQueryRead").toAbsolutePath().toUri().getPath();
    df.write().csv("file://" + outputDir);
    // wait for event processing to complete
    StaticExecutionContextFactory.waitForExecutionEnd();
    ArgumentCaptor<OpenLineage.RunEvent> lineageEvent = ArgumentCaptor.forClass(OpenLineage.RunEvent.class);
    Mockito.verify(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT, times(4)).emit(lineageEvent.capture());
    List<OpenLineage.RunEvent> events = lineageEvent.getAllValues();
    assertThat(events.get(2).getRun().getFacets().getAdditionalProperties())
        .hasEntrySatisfying(TestOpenLineageEventHandlerFactory.TEST_FACET_KEY,
            facet -> assertThat(facet)
                .isInstanceOf(DefaultRunFacet.class)
                .extracting("additionalProperties", InstanceOfAssertFactories.map(String.class, Object.class))
                .containsKey("message"));
    List<OpenLineage.InputDataset> inputs = events.get(2).getInputs();
    assertEquals(1, inputs.size());
    assertEquals("bigquery", inputs.get(0).getNamespace());
    assertEquals(BigQueryUtil.friendlyTableName(tableId), inputs.get(0).getName());
    List<OpenLineage.OutputDataset> outputs = events.get(2).getOutputs();
    assertEquals(1, outputs.size());
    OpenLineage.OutputDataset output = outputs.get(0);
    assertEquals("file", output.getNamespace());
    assertEquals(outputDir, output.getName());
    OpenLineage.SchemaDatasetFacet schemaDatasetFacet = PlanUtils.schemaFacet(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI), tableSchema);
    assertThat(output.getFacets().getSchema()).usingRecursiveComparison().isEqualTo(schemaDatasetFacet);
    assertNotNull(output.getFacets().getAdditionalProperties());
    assertThat(output.getOutputFacets().getOutputStatistics()).isNotNull();
}
Also used : TableId(com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.TableId) StructType(org.apache.spark.sql.types.StructType) DefaultRunFacet(io.openlineage.client.OpenLineage.DefaultRunFacet) Binder(com.google.cloud.spark.bigquery.repackaged.com.google.inject.Binder) StructField(org.apache.spark.sql.types.StructField) BigQuery(com.google.cloud.spark.bigquery.repackaged.com.google.cloud.bigquery.BigQuery) InputDataset(io.openlineage.client.OpenLineage.InputDataset) OutputDataset(io.openlineage.client.OpenLineage.OutputDataset) Dataset(org.apache.spark.sql.Dataset) Provides(com.google.cloud.spark.bigquery.repackaged.com.google.inject.Provides) GenericRowWithSchema(org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema) OpenLineage(io.openlineage.client.OpenLineage) GenericRow(org.apache.spark.sql.catalyst.expressions.GenericRow) Row(org.apache.spark.sql.Row) Module(com.google.cloud.spark.bigquery.repackaged.com.google.inject.Module) RunEvent(io.openlineage.client.OpenLineage.RunEvent) Test(org.junit.jupiter.api.Test)
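
The assertions above select the third captured event by position (events.get(2)), which is tied to the exact number of emitted events. A hypothetical alternative inside the same test is to filter the captured list by event type; this assumes the generated getter RunEvent.getEventType() and uses COMPLETE only as an illustration:

// Hypothetical, inside the same test: select captured events by type instead of by index.
List<OpenLineage.RunEvent> completeEvents = lineageEvent.getAllValues().stream()
    .filter(e -> e.getEventType() == OpenLineage.RunEvent.EventType.COMPLETE)
    .collect(Collectors.toList());
assertThat(completeEvents).isNotEmpty();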

Example 4 with OpenLineage

use of io.openlineage.client.OpenLineage in project OpenLineage by OpenLineage.

The class LogicalRDDVisitorTest, method testApply:

@Test
public void testApply(@TempDir Path tmpDir) {
    SparkSession session = SparkSession.builder().master("local").getOrCreate();
    LogicalRDDVisitor visitor = new LogicalRDDVisitor(
        SparkAgentTestExtension.newContext(session),
        DatasetFactory.output(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI)));
    StructType schema = new StructType(new StructField[] {
        new StructField("anInt", IntegerType$.MODULE$, false, new Metadata(new HashMap<>())),
        new StructField("aString", StringType$.MODULE$, false, new Metadata(new HashMap<>())) });
    jobConf = new JobConf();
    FileInputFormat.addInputPath(jobConf, new org.apache.hadoop.fs.Path("file://" + tmpDir));
    RDD<InternalRow> hadoopRdd = new HadoopRDD<>(
            session.sparkContext(), jobConf, TextInputFormat.class, LongWritable.class, Text.class, 1)
        .toJavaRDD()
        .map(t -> (InternalRow) new GenericInternalRow(new Object[] { t._2.toString() }))
        .rdd();
    LogicalRDD logicalRDD = new LogicalRDD(
        ScalaConversionUtils.fromSeq(schema.toAttributes()).stream()
            .map(AttributeReference::toAttribute).collect(ScalaConversionUtils.toSeq()),
        hadoopRdd, SinglePartition$.MODULE$, Seq$.MODULE$.<SortOrder>empty(), false, session);
    assertThat(visitor.isDefinedAt(logicalRDD)).isTrue();
    List<OpenLineage.Dataset> datasets = visitor.apply(logicalRDD);
    assertThat(datasets).singleElement().hasFieldOrPropertyWithValue("name", tmpDir.toString()).hasFieldOrPropertyWithValue("namespace", "file");
}
Also used : OpenLineageClient(io.openlineage.spark.agent.client.OpenLineageClient) Seq$(scala.collection.Seq$) TextInputFormat(org.apache.hadoop.mapred.TextInputFormat) InternalRow(org.apache.spark.sql.catalyst.InternalRow) SinglePartition$(org.apache.spark.sql.catalyst.plans.physical.SinglePartition$) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) GenericInternalRow(org.apache.spark.sql.catalyst.expressions.GenericInternalRow) AttributeReference(org.apache.spark.sql.catalyst.expressions.AttributeReference) SparkAgentTestExtension(io.openlineage.spark.agent.SparkAgentTestExtension) ExtendWith(org.junit.jupiter.api.extension.ExtendWith) HadoopRDD(org.apache.spark.rdd.HadoopRDD) Path(java.nio.file.Path) SparkSession(org.apache.spark.sql.SparkSession) Metadata(org.apache.spark.sql.types.Metadata) StringType$(org.apache.spark.sql.types.StringType$) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) IntegerType$(org.apache.spark.sql.types.IntegerType$) SparkSession$(org.apache.spark.sql.SparkSession$) DatasetFactory(io.openlineage.spark.api.DatasetFactory) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) ScalaConversionUtils(io.openlineage.spark.agent.util.ScalaConversionUtils) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.jupiter.api.Test) List(java.util.List) AfterEach(org.junit.jupiter.api.AfterEach) SortOrder(org.apache.spark.sql.catalyst.expressions.SortOrder) TempDir(org.junit.jupiter.api.io.TempDir) LogicalRDD(org.apache.spark.sql.execution.LogicalRDD) HashMap(scala.collection.immutable.HashMap) OpenLineage(io.openlineage.client.OpenLineage) RDD(org.apache.spark.rdd.RDD)

Example 5 with OpenLineage

use of io.openlineage.client.OpenLineage in project OpenLineage by OpenLineage.

The class LogicalRelationDatasetBuilderTest, method testApplyForHadoopFsRelation:

@Test
void testApplyForHadoopFsRelation() {
    HadoopFsRelation hadoopFsRelation = mock(HadoopFsRelation.class);
    LogicalRelation logicalRelation = mock(LogicalRelation.class);
    Configuration hadoopConfig = mock(Configuration.class);
    SparkContext sparkContext = mock(SparkContext.class);
    FileIndex fileIndex = mock(FileIndex.class);
    OpenLineage openLineage = mock(OpenLineage.class);
    SessionState sessionState = mock(SessionState.class);
    Path p1 = new Path("/tmp/path1");
    Path p2 = new Path("/tmp/path2");
    when(logicalRelation.relation()).thenReturn(hadoopFsRelation);
    when(openLineageContext.getSparkContext()).thenReturn(sparkContext);
    when(openLineageContext.getSparkSession()).thenReturn(Optional.of(session));
    when(openLineageContext.getOpenLineage()).thenReturn(openLineage);
    when(openLineage.newDatasetFacetsBuilder()).thenReturn(new OpenLineage.DatasetFacetsBuilder());
    when(session.sessionState()).thenReturn(sessionState);
    when(sessionState.newHadoopConfWithOptions(any())).thenReturn(hadoopConfig);
    when(hadoopFsRelation.location()).thenReturn(fileIndex);
    when(fileIndex.rootPaths())
        .thenReturn(scala.collection.JavaConverters.collectionAsScalaIterableConverter(Arrays.asList(p1, p2))
            .asScala().toSeq());
    try (MockedStatic mocked = mockStatic(PlanUtils.class)) {
        when(PlanUtils.getDirectoryPath(p1, hadoopConfig)).thenReturn(new Path("/tmp"));
        when(PlanUtils.getDirectoryPath(p2, hadoopConfig)).thenReturn(new Path("/tmp"));
        List<OpenLineage.Dataset> datasets = builder.apply(logicalRelation);
        assertEquals(1, datasets.size());
        OpenLineage.Dataset ds = datasets.get(0);
        assertEquals("/tmp", ds.getName());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SessionState(org.apache.spark.sql.internal.SessionState) Configuration(org.apache.hadoop.conf.Configuration) MockedStatic(org.mockito.MockedStatic) OutputDataset(io.openlineage.client.OpenLineage.OutputDataset) LogicalRelation(org.apache.spark.sql.execution.datasources.LogicalRelation) FileIndex(org.apache.spark.sql.execution.datasources.FileIndex) SparkContext(org.apache.spark.SparkContext) OpenLineage(io.openlineage.client.OpenLineage) HadoopFsRelation(org.apache.spark.sql.execution.datasources.HadoopFsRelation) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
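
Both root paths above are stubbed to the same parent directory, which is why they collapse into a single dataset named "/tmp". A hypothetical companion check, under the assumption that the builder emits one dataset per distinct directory (suggested by the de-duplication above, but not verified against the project):

// Hypothetical: distinct parent directories, assumed to yield one dataset each.
try (MockedStatic mocked = mockStatic(PlanUtils.class)) {
    when(PlanUtils.getDirectoryPath(p1, hadoopConfig)).thenReturn(new Path("/tmp/a"));
    when(PlanUtils.getDirectoryPath(p2, hadoopConfig)).thenReturn(new Path("/tmp/b"));
    List<OpenLineage.Dataset> datasets = builder.apply(logicalRelation);
    assertEquals(2, datasets.size()); // assumption: one dataset per distinct directory
}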

Aggregations

OpenLineage (io.openlineage.client.OpenLineage): 38
Test (org.junit.jupiter.api.Test): 23
SparkListenerJobEnd (org.apache.spark.scheduler.SparkListenerJobEnd): 12
SparkListenerJobStart (org.apache.spark.scheduler.SparkListenerJobStart): 9
SparkListenerSQLExecutionEnd (org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd): 9
InputDataset (io.openlineage.client.OpenLineage.InputDataset): 7
OpenLineageContext (io.openlineage.spark.api.OpenLineageContext): 7
LogicalRelation (org.apache.spark.sql.execution.datasources.LogicalRelation): 7
SparkListenerSQLExecutionStart (org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart): 7
OutputDataset (io.openlineage.client.OpenLineage.OutputDataset): 6
HashMap (java.util.HashMap): 6
SparkSession (org.apache.spark.sql.SparkSession): 6
AttributeReference (org.apache.spark.sql.catalyst.expressions.AttributeReference): 6
RunFacet (io.openlineage.client.OpenLineage.RunFacet): 5
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 4
RunEvent (io.openlineage.client.OpenLineage.RunEvent): 4
SparkListenerStageCompleted (org.apache.spark.scheduler.SparkListenerStageCompleted): 4
JsonAnyGetter (com.fasterxml.jackson.annotation.JsonAnyGetter): 3
JsonAnySetter (com.fasterxml.jackson.annotation.JsonAnySetter): 3
JsonParser (com.fasterxml.jackson.core.JsonParser): 3