Use of io.openlineage.client.OpenLineage in project OpenLineage by OpenLineage.
The class OpenLineageRunEventBuilder, method buildInputDatasets.
private List<OpenLineage.InputDataset> buildInputDatasets(List<Object> nodes) {
openLineageContext.getQueryExecution().ifPresent(qe -> {
if (log.isDebugEnabled()) {
log.debug("Traversing optimized plan {}", qe.optimizedPlan().toJSON());
log.debug("Physical plan executed {}", qe.executedPlan().toJSON());
}
});
log.info("Visiting query plan {} with input dataset builders {}", openLineageContext.getQueryExecution(), inputDatasetBuilders);
Function1<LogicalPlan, Collection<InputDataset>> inputVisitor = visitLogicalPlan(PlanUtils.merge(inputDatasetQueryPlanVisitors));
List<OpenLineage.InputDataset> datasets =
    Stream.concat(
            buildDatasets(nodes, inputDatasetBuilders),
            openLineageContext
                .getQueryExecution()
                .map(qe ->
                    fromSeq(qe.optimizedPlan().map(inputVisitor)).stream()
                        .flatMap(Collection::stream)
                        .map(((Class<InputDataset>) InputDataset.class)::cast))
                .orElse(Stream.empty()))
        .collect(Collectors.toList());
OpenLineage openLineage = openLineageContext.getOpenLineage();
if (!datasets.isEmpty()) {
Map<String, InputDatasetFacet> inputFacetsMap = new HashMap<>();
nodes.forEach(
    event -> inputDatasetFacetBuilders.forEach(fn -> fn.accept(event, inputFacetsMap::put)));
Map<String, DatasetFacets> datasetFacetsMap = new HashMap<>();
nodes.forEach(
    event -> datasetFacetBuilders.forEach(fn -> fn.accept(event, datasetFacetsMap::put)));
return datasets.stream()
    .map(ds ->
        openLineage
            .newInputDatasetBuilder()
            .name(ds.getName())
            .namespace(ds.getNamespace())
            .inputFacets(
                mergeFacets(inputFacetsMap, ds.getInputFacets(), InputDatasetInputFacets.class))
            .facets(mergeFacets(datasetFacetsMap, ds.getFacets(), DatasetFacets.class))
            .build())
    .collect(Collectors.toList());
}
return datasets;
}
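The two forEach loops above treat every registered facet builder as a callback that pushes (facet name, facet) pairs into a shared map. A minimal sketch of that pattern, using plain Java types as stand-ins for the OpenLineage facet classes (the builder and event below are illustrative, not part of the integration's API):

import java.util.HashMap;
import java.util.Map;
import java.util.function.BiConsumer;

public class FacetCollectionSketch {
  public static void main(String[] args) {
    // Stand-in facet builder: inspects an event and, when it applies,
    // calls back with a (facetName, facet) pair.
    BiConsumer<Object, BiConsumer<String, String>> exampleFacetBuilder =
        (event, put) -> put.accept("exampleFacet", "built from " + event);
    Map<String, String> inputFacetsMap = new HashMap<>();
    // Mirrors fn.accept(event, inputFacetsMap::put) in buildInputDatasets above.
    exampleFacetBuilder.accept("a Spark listener event", inputFacetsMap::put);
    System.out.println(inputFacetsMap); // {exampleFacet=built from a Spark listener event}
  }
}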
Use of io.openlineage.client.OpenLineage in project OpenLineage by OpenLineage.
The class RddExecutionContext, method start.
@Override
public void start(SparkListenerJobStart jobStart) {
OpenLineage ol = new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI);
OpenLineage.RunEvent event =
    ol.newRunEventBuilder()
        .eventTime(toZonedTime(jobStart.time()))
        .eventType(OpenLineage.RunEvent.EventType.START)
        .inputs(buildInputs(inputs))
        .outputs(buildOutputs(outputs))
        .run(ol.newRunBuilder().runId(runId).facets(buildRunFacets(null)).build())
        .job(buildJob(jobStart.jobId()))
        .build();
log.debug("Posting event for start {}: {}", jobStart, event);
sparkContext.emit(event);
}
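The event time comes from jobStart.time(), which Spark reports as epoch milliseconds. A plausible sketch of the toZonedTime helper referenced above (assumed here, not reproduced from the class) converts it to the UTC ZonedDateTime the RunEvent builder expects:

import java.time.Instant;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;

class TimeConversionSketch {
  // Assumed shape of the helper: epoch milliseconds from the Spark listener event,
  // converted to a UTC ZonedDateTime for RunEvent.eventTime.
  static ZonedDateTime toZonedTime(long time) {
    return ZonedDateTime.ofInstant(Instant.ofEpochMilli(time), ZoneOffset.UTC);
  }
}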
Use of io.openlineage.client.OpenLineage in project OpenLineage by OpenLineage.
The class SparkReadWriteIntegTest, method testBigQueryReadWriteToFile.
@Test
public void testBigQueryReadWriteToFile(@TempDir Path writeDir, SparkSession spark) throws InterruptedException, TimeoutException {
TableId tableId = TableId.of("testproject", "dataset", "MyTable");
BigQuery bq = MockBigQueryRelationProvider.BIG_QUERY;
StructType tableSchema =
    new StructType(
        new StructField[] {
          new StructField("name", StringType$.MODULE$, false, Metadata.empty()),
          new StructField("age", LongType$.MODULE$, false, Metadata.empty())
        });
MockBigQueryRelationProvider.INJECTOR.setTestModule(new Module() {
@Override
public void configure(Binder binder) {
}
@Provides
public Dataset<Row> testData() {
return spark.createDataFrame(
    Arrays.asList(
        new GenericRowWithSchema(new Object[] { "john", 25L }, tableSchema),
        new GenericRowWithSchema(new Object[] { "sam", 22L }, tableSchema),
        new GenericRowWithSchema(new Object[] { "alicia", 35L }, tableSchema),
        new GenericRowWithSchema(new Object[] { "bob", 47L }, tableSchema),
        new GenericRowWithSchema(new Object[] { "jordan", 52L }, tableSchema),
        new GenericRowWithSchema(new Object[] { "liz", 19L }, tableSchema),
        new GenericRowWithSchema(new Object[] { "marcia", 83L }, tableSchema),
        new GenericRowWithSchema(new Object[] { "maria", 40L }, tableSchema),
        new GenericRowWithSchema(new Object[] { "luis", 8L }, tableSchema),
        new GenericRowWithSchema(new Object[] { "gabriel", 30L }, tableSchema)),
    tableSchema);
}
});
when(bq.getTable(eq(tableId)))
    .thenAnswer(
        invocation ->
            MockBigQueryRelationProvider.makeTable(
                tableId,
                StandardTableDefinition.newBuilder()
                    .setSchema(
                        Schema.of(
                            Field.of("name", StandardSQLTypeName.STRING),
                            Field.of("age", StandardSQLTypeName.INT64)))
                    .setNumBytes(100L)
                    .setNumRows(1000L)
                    .build()));
Dataset<Row> df =
    spark.read()
        .format(MockBigQueryRelationProvider.class.getName())
        .option("gcpAccessToken", "not a real access token")
        .option("parentProject", "not a project")
        .load("testproject.dataset.MyTable");
String outputDir = writeDir.resolve("testBigQueryRead").toAbsolutePath().toUri().getPath();
df.write().csv("file://" + outputDir);
// wait for event processing to complete
StaticExecutionContextFactory.waitForExecutionEnd();
ArgumentCaptor<OpenLineage.RunEvent> lineageEvent = ArgumentCaptor.forClass(OpenLineage.RunEvent.class);
Mockito.verify(SparkAgentTestExtension.OPEN_LINEAGE_SPARK_CONTEXT, times(4)).emit(lineageEvent.capture());
List<OpenLineage.RunEvent> events = lineageEvent.getAllValues();
assertThat(events.get(2).getRun().getFacets().getAdditionalProperties())
    .hasEntrySatisfying(
        TestOpenLineageEventHandlerFactory.TEST_FACET_KEY,
        facet ->
            assertThat(facet)
                .isInstanceOf(DefaultRunFacet.class)
                .extracting(
                    "additionalProperties",
                    InstanceOfAssertFactories.map(String.class, Object.class))
                .containsKey("message"));
List<OpenLineage.InputDataset> inputs = events.get(2).getInputs();
assertEquals(1, inputs.size());
assertEquals("bigquery", inputs.get(0).getNamespace());
assertEquals(BigQueryUtil.friendlyTableName(tableId), inputs.get(0).getName());
List<OpenLineage.OutputDataset> outputs = events.get(2).getOutputs();
assertEquals(1, outputs.size());
OpenLineage.OutputDataset output = outputs.get(0);
assertEquals("file", output.getNamespace());
assertEquals(outputDir, output.getName());
OpenLineage.SchemaDatasetFacet schemaDatasetFacet = PlanUtils.schemaFacet(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI), tableSchema);
assertThat(output.getFacets().getSchema()).usingRecursiveComparison().isEqualTo(schemaDatasetFacet);
assertNotNull(output.getFacets().getAdditionalProperties());
assertThat(output.getOutputFacets().getOutputStatistics()).isNotNull();
}
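For reference, the input dataset those assertions describe could be rebuilt with the same client builders used elsewhere on this page; the literal name below is what BigQueryUtil.friendlyTableName(tableId) is assumed to return for this TableId (a sketch, not part of the test):

OpenLineage ol = new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI);
OpenLineage.InputDataset expectedInput =
    ol.newInputDatasetBuilder()
        .namespace("bigquery")
        .name("testproject.dataset.MyTable") // assumed friendlyTableName output
        .build();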
Use of io.openlineage.client.OpenLineage in project OpenLineage by OpenLineage.
The class LogicalRDDVisitorTest, method testApply.
@Test
public void testApply(@TempDir Path tmpDir) {
SparkSession session = SparkSession.builder().master("local").getOrCreate();
LogicalRDDVisitor visitor =
    new LogicalRDDVisitor(
        SparkAgentTestExtension.newContext(session),
        DatasetFactory.output(new OpenLineage(OpenLineageClient.OPEN_LINEAGE_CLIENT_URI)));
StructType schema =
    new StructType(
        new StructField[] {
          new StructField("anInt", IntegerType$.MODULE$, false, new Metadata(new HashMap<>())),
          new StructField("aString", StringType$.MODULE$, false, new Metadata(new HashMap<>()))
        });
jobConf = new JobConf();
FileInputFormat.addInputPath(jobConf, new org.apache.hadoop.fs.Path("file://" + tmpDir));
RDD<InternalRow> hadoopRdd =
    new HadoopRDD<>(
            session.sparkContext(),
            jobConf,
            TextInputFormat.class,
            LongWritable.class,
            Text.class,
            1)
        .toJavaRDD()
        .map(t -> (InternalRow) new GenericInternalRow(new Object[] { t._2.toString() }))
        .rdd();
LogicalRDD logicalRDD =
    new LogicalRDD(
        ScalaConversionUtils.fromSeq(schema.toAttributes()).stream()
            .map(AttributeReference::toAttribute)
            .collect(ScalaConversionUtils.toSeq()),
        hadoopRdd,
        SinglePartition$.MODULE$,
        Seq$.MODULE$.<SortOrder>empty(),
        false,
        session);
assertThat(visitor.isDefinedAt(logicalRDD)).isTrue();
List<OpenLineage.Dataset> datasets = visitor.apply(logicalRDD);
assertThat(datasets).singleElement().hasFieldOrPropertyWithValue("name", tmpDir.toString()).hasFieldOrPropertyWithValue("namespace", "file");
}
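The test exercises the partial-function protocol that plan visitors follow throughout the integration: callers check isDefinedAt before calling apply. A short usage sketch with the names from the test above:

// Only apply the visitor to plan nodes it declares support for.
if (visitor.isDefinedAt(logicalRDD)) {
  List<OpenLineage.Dataset> found = visitor.apply(logicalRDD);
  found.forEach(d -> System.out.println(d.getNamespace() + ":" + d.getName()));
}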
Use of io.openlineage.client.OpenLineage in project OpenLineage by OpenLineage.
The class LogicalRelationDatasetBuilderTest, method testApplyForHadoopFsRelation.
@Test
void testApplyForHadoopFsRelation() {
HadoopFsRelation hadoopFsRelation = mock(HadoopFsRelation.class);
LogicalRelation logicalRelation = mock(LogicalRelation.class);
Configuration hadoopConfig = mock(Configuration.class);
SparkContext sparkContext = mock(SparkContext.class);
FileIndex fileIndex = mock(FileIndex.class);
OpenLineage openLineage = mock(OpenLineage.class);
SessionState sessionState = mock(SessionState.class);
Path p1 = new Path("/tmp/path1");
Path p2 = new Path("/tmp/path2");
when(logicalRelation.relation()).thenReturn(hadoopFsRelation);
when(openLineageContext.getSparkContext()).thenReturn(sparkContext);
when(openLineageContext.getSparkSession()).thenReturn(Optional.of(session));
when(openLineageContext.getOpenLineage()).thenReturn(openLineage);
when(openLineage.newDatasetFacetsBuilder()).thenReturn(new OpenLineage.DatasetFacetsBuilder());
when(session.sessionState()).thenReturn(sessionState);
when(sessionState.newHadoopConfWithOptions(any())).thenReturn(hadoopConfig);
when(hadoopFsRelation.location()).thenReturn(fileIndex);
when(fileIndex.rootPaths())
    .thenReturn(
        scala.collection.JavaConverters.collectionAsScalaIterableConverter(Arrays.asList(p1, p2))
            .asScala()
            .toSeq());
try (MockedStatic mocked = mockStatic(PlanUtils.class)) {
when(PlanUtils.getDirectoryPath(p1, hadoopConfig)).thenReturn(new Path("/tmp"));
when(PlanUtils.getDirectoryPath(p2, hadoopConfig)).thenReturn(new Path("/tmp"));
List<OpenLineage.Dataset> datasets = builder.apply(logicalRelation);
assertEquals(1, datasets.size());
OpenLineage.Dataset ds = datasets.get(0);
assertEquals("/tmp", ds.getName());
}
}
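The single-dataset assertion works because both root paths are stubbed to resolve to the same directory. One way to picture the expected collapse (a sketch under that assumption, not the builder's actual code):

// Each root path maps through getDirectoryPath (stubbed to "/tmp" above); keeping
// only distinct results leaves a single dataset name.
List<Path> rootPaths = Arrays.asList(new Path("/tmp/path1"), new Path("/tmp/path2"));
List<String> datasetNames =
    rootPaths.stream()
        .map(p -> new Path("/tmp")) // stands in for PlanUtils.getDirectoryPath(p, hadoopConfig)
        .distinct()
        .map(Path::toString)
        .collect(Collectors.toList()); // ["/tmp"]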