Use of io.openlineage.client.OpenLineage.OutputDataset in project OpenLineage by OpenLineage.
The class SaveIntoDataSourceCommandVisitor, method apply.
@Override
public List<OpenLineage.OutputDataset> apply(SparkListenerEvent event) {
  BaseRelation relation;
  SaveIntoDataSourceCommand command =
      (SaveIntoDataSourceCommand) context.getQueryExecution().get().optimizedPlan();
  // Handle Kafka as a special case, as other impls of CreatableRelationProvider
  // may not be handled in the generic way.
  if (KafkaRelationVisitor.isKafkaSource(command.dataSource())) {
    return KafkaRelationVisitor.createKafkaDatasets(
        outputDataset(), command.dataSource(), command.options(), command.mode(), command.schema());
  }
  if (command.dataSource().getClass().getName().contains("DeltaDataSource")) {
    if (command.options().contains("path")) {
      URI uri = URI.create(command.options().get("path").get());
      return Collections.singletonList(
          outputDataset().getDataset(PathUtils.fromURI(uri, "file"), command.schema()));
    }
  }
  SQLContext sqlContext = context.getSparkSession().get().sqlContext();
  try {
    if (command.dataSource() instanceof RelationProvider) {
      RelationProvider p = (RelationProvider) command.dataSource();
      relation = p.createRelation(sqlContext, command.options());
    } else {
      SchemaRelationProvider p = (SchemaRelationProvider) command.dataSource();
      relation = p.createRelation(sqlContext, command.options(), command.schema());
    }
  } catch (Exception ex) {
    // Scala exceptions are not detected reliably here, so check the type explicitly.
    if (ex instanceof SQLException) {
      // This can happen on SparkListenerSQLExecutionStart, e.g. for SQLite, when the
      // database does not exist yet - it will be created during command execution.
      // It is safe to ignore on start, because it will work on end;
      // see SparkReadWriteIntegTest.testReadFromFileWriteToJdbc.
      log.warn("Can't create relation: ", ex);
      return Collections.emptyList();
    }
    throw ex;
  }
  LogicalRelation logicalRelation =
      new LogicalRelation(
          relation, relation.schema().toAttributes(), Option.empty(), command.isStreaming());
  return delegate(
          context.getOutputDatasetQueryPlanVisitors(), context.getOutputDatasetBuilders(), event)
      .applyOrElse(
          logicalRelation,
          ScalaConversionUtils.toScalaFn((lp) -> Collections.<OutputDataset>emptyList()))
      .stream()
      .map(
          ds -> {
            Builder<String, OpenLineage.DatasetFacet> facetsMap =
                ImmutableMap.<String, OpenLineage.DatasetFacet>builder();
            if (ds.getFacets().getAdditionalProperties() != null) {
              facetsMap.putAll(ds.getFacets().getAdditionalProperties());
            }
            ds.getFacets().getAdditionalProperties().putAll(facetsMap.build());
            if (SaveMode.Overwrite == command.mode()) {
              // rebuild the whole dataset with a LifecycleStateChange facet added
              OpenLineage.DatasetFacets facets =
                  context.getOpenLineage().newDatasetFacets(
                      ds.getFacets().getDocumentation(),
                      ds.getFacets().getDataSource(),
                      ds.getFacets().getVersion(),
                      ds.getFacets().getSchema(),
                      context.getOpenLineage().newLifecycleStateChangeDatasetFacet(
                          OpenLineage.LifecycleStateChangeDatasetFacet.LifecycleStateChange.OVERWRITE,
                          null));
              return context.getOpenLineage().newOutputDataset(
                  ds.getNamespace(), ds.getName(), facets, ds.getOutputFacets());
            }
            return ds;
          })
      .collect(Collectors.toList());
}
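The Overwrite branch above rebuilds the dataset by hand via newDatasetFacets. As a minimal sketch using only the factory methods visible in these snippets, the same LifecycleStateChange facet can also be attached with the facets builder; the "ns" and "table" identifiers are placeholders, not from the original code:
OpenLineage ol = context.getOpenLineage();
// Sketch: build an OutputDataset that carries an OVERWRITE lifecycle facet.
OpenLineage.DatasetFacets facets =
    ol.newDatasetFacetsBuilder()
        .lifecycleStateChange(
            ol.newLifecycleStateChangeDatasetFacet(
                OpenLineage.LifecycleStateChangeDatasetFacet.LifecycleStateChange.OVERWRITE, null))
        .build();
OpenLineage.OutputDataset overwritten = ol.newOutputDataset("ns", "table", facets, null);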
Use of io.openlineage.client.OpenLineage.OutputDataset in project OpenLineage by OpenLineage.
The class OpenLineageTest, method jsonSerialization.
@Test
public void jsonSerialization() throws JsonProcessingException {
  ZonedDateTime now = ZonedDateTime.now(ZoneId.of("UTC"));
  URI producer = URI.create("producer");
  OpenLineage ol = new OpenLineage(producer);
  UUID runId = UUID.randomUUID();
  RunFacets runFacets =
      ol.newRunFacetsBuilder().nominalTime(ol.newNominalTimeRunFacet(now, now)).build();
  Run run = ol.newRun(runId, runFacets);
  String name = "jobName";
  String namespace = "namespace";
  JobFacets jobFacets = ol.newJobFacetsBuilder().build();
  Job job = ol.newJob(namespace, name, jobFacets);
  List<InputDataset> inputs = Arrays.asList(ol.newInputDataset("ins", "input", null, null));
  List<OutputDataset> outputs = Arrays.asList(ol.newOutputDataset("ons", "output", null, null));
  RunEvent runStateUpdate =
      ol.newRunEvent(OpenLineage.RunEvent.EventType.START, now, run, job, inputs, outputs);
  String json = mapper.writeValueAsString(runStateUpdate);
  RunEvent read = mapper.readValue(json, RunEvent.class);
  assertEquals(producer, read.getProducer());
  assertEquals(runId, read.getRun().getRunId());
  assertEquals(name, read.getJob().getName());
  assertEquals(namespace, read.getJob().getNamespace());
  assertEquals(runStateUpdate.getEventType(), read.getEventType());
  assertEquals(runStateUpdate.getEventTime(), read.getEventTime());
  assertEquals(1, runStateUpdate.getInputs().size());
  NominalTimeRunFacet nominalTime = runStateUpdate.getRun().getFacets().getNominalTime();
  assertEquals(now, nominalTime.getNominalStartTime());
  assertEquals(now, nominalTime.getNominalEndTime());
  InputDataset inputDataset = runStateUpdate.getInputs().get(0);
  assertEquals("ins", inputDataset.getNamespace());
  assertEquals("input", inputDataset.getName());
  assertEquals(1, runStateUpdate.getOutputs().size());
  OutputDataset outputDataset = runStateUpdate.getOutputs().get(0);
  assertEquals("ons", outputDataset.getNamespace());
  assertEquals("output", outputDataset.getName());
  assertEquals(roundTrip(json), roundTrip(mapper.writeValueAsString(read)));
}
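The test relies on a shared mapper field and a roundTrip helper that are not shown in this snippet. A plausible sketch, assuming roundTrip merely normalizes JSON through Jackson so formatting differences do not affect the final equality check:
// Hypothetical helper (not part of the snippet above): parse and re-serialize
// the JSON so two semantically equal documents compare as equal strings.
String roundTrip(String json) throws JsonProcessingException {
  return mapper.readTree(json).toString();
}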
Use of io.openlineage.client.OpenLineage.OutputDataset in project OpenLineage by OpenLineage.
The class OpenLineageTest, method factory.
@Test
public void factory() throws JsonProcessingException {
  ZonedDateTime now = ZonedDateTime.now(ZoneId.of("UTC"));
  URI producer = URI.create("producer");
  OpenLineage ol = new OpenLineage(producer);
  UUID runId = UUID.randomUUID();
  RunFacets runFacets =
      ol.newRunFacetsBuilder()
          .nominalTime(
              ol.newNominalTimeRunFacetBuilder().nominalStartTime(now).nominalEndTime(now).build())
          .build();
  Run run = ol.newRunBuilder().runId(runId).facets(runFacets).build();
  String name = "jobName";
  String namespace = "namespace";
  JobFacets jobFacets = ol.newJobFacetsBuilder().build();
  Job job = ol.newJobBuilder().namespace(namespace).name(name).facets(jobFacets).build();
  List<InputDataset> inputs =
      Arrays.asList(
          ol.newInputDatasetBuilder()
              .namespace("ins")
              .name("input")
              .facets(
                  ol.newDatasetFacetsBuilder()
                      .version(ol.newDatasetVersionDatasetFacet("input-version"))
                      .build())
              .inputFacets(
                  ol.newInputDatasetInputFacetsBuilder()
                      .dataQualityMetrics(
                          ol.newDataQualityMetricsInputDatasetFacetBuilder()
                              .rowCount(10L)
                              .bytes(20L)
                              .columnMetrics(
                                  ol.newDataQualityMetricsInputDatasetFacetColumnMetricsBuilder()
                                      .put(
                                          "mycol",
                                          ol.newDataQualityMetricsInputDatasetFacetColumnMetricsAdditionalBuilder()
                                              .count(10D).distinctCount(10L).max(30D).min(5D)
                                              .nullCount(1L).sum(3000D)
                                              .quantiles(
                                                  ol.newDataQualityMetricsInputDatasetFacetColumnMetricsAdditionalQuantilesBuilder()
                                                      .put("25", 52D).build())
                                              .build())
                                      .build())
                              .build())
                      .build())
              .build());
  List<OutputDataset> outputs =
      Arrays.asList(
          ol.newOutputDatasetBuilder()
              .namespace("ons")
              .name("output")
              .facets(
                  ol.newDatasetFacetsBuilder()
                      .version(ol.newDatasetVersionDatasetFacet("output-version"))
                      .build())
              .outputFacets(
                  ol.newOutputDatasetOutputFacetsBuilder()
                      .outputStatistics(ol.newOutputStatisticsOutputDatasetFacet(10L, 20L))
                      .build())
              .build());
  RunEvent runStateUpdate =
      ol.newRunEventBuilder()
          .eventType(OpenLineage.RunEvent.EventType.START)
          .eventTime(now)
          .run(run)
          .job(job)
          .inputs(inputs)
          .outputs(outputs)
          .build();
  ObjectMapper mapper = new ObjectMapper();
  mapper.registerModule(new JavaTimeModule());
  mapper.setSerializationInclusion(Include.NON_NULL);
  mapper.disable(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS);
  mapper.configure(SerializationFeature.INDENT_OUTPUT, true);
  String json = mapper.writeValueAsString(runStateUpdate);
  {
    RunEvent read = mapper.readValue(json, RunEvent.class);
    assertEquals(producer, read.getProducer());
    assertEquals(runId, read.getRun().getRunId());
    assertEquals(name, read.getJob().getName());
    assertEquals(namespace, read.getJob().getNamespace());
    assertEquals(runStateUpdate.getEventType(), read.getEventType());
    assertEquals(runStateUpdate.getEventTime(), read.getEventTime());
    assertEquals(1, runStateUpdate.getInputs().size());
    InputDataset inputDataset = runStateUpdate.getInputs().get(0);
    assertEquals("ins", inputDataset.getNamespace());
    assertEquals("input", inputDataset.getName());
    assertEquals("input-version", inputDataset.getFacets().getVersion().getDatasetVersion());
    DataQualityMetricsInputDatasetFacet dq = inputDataset.getInputFacets().getDataQualityMetrics();
    assertEquals((Long) 10L, dq.getRowCount());
    assertEquals((Long) 20L, dq.getBytes());
    DataQualityMetricsInputDatasetFacetColumnMetricsAdditional colMetrics =
        dq.getColumnMetrics().getAdditionalProperties().get("mycol");
    assertEquals((Double) 10D, colMetrics.getCount());
    assertEquals((Long) 10L, colMetrics.getDistinctCount());
    assertEquals((Double) 30D, colMetrics.getMax());
    assertEquals((Double) 5D, colMetrics.getMin());
    assertEquals((Long) 1L, colMetrics.getNullCount());
    assertEquals((Double) 3000D, colMetrics.getSum());
    assertEquals((Double) 52D, colMetrics.getQuantiles().getAdditionalProperties().get("25"));
    assertEquals(1, runStateUpdate.getOutputs().size());
    OutputDataset outputDataset = runStateUpdate.getOutputs().get(0);
    assertEquals("ons", outputDataset.getNamespace());
    assertEquals("output", outputDataset.getName());
    assertEquals("output-version", outputDataset.getFacets().getVersion().getDatasetVersion());
    assertEquals(roundTrip(json), roundTrip(mapper.writeValueAsString(read)));
    assertEquals((Long) 10L, outputDataset.getOutputFacets().getOutputStatistics().getRowCount());
    assertEquals((Long) 20L, outputDataset.getOutputFacets().getOutputStatistics().getSize());
    assertEquals(json, mapper.writeValueAsString(read));
  }
  {
    io.openlineage.server.OpenLineage.RunEvent readServer =
        mapper.readValue(json, io.openlineage.server.OpenLineage.RunEvent.class);
    assertEquals(producer, readServer.getProducer());
    assertEquals(runId, readServer.getRun().getRunId());
    assertEquals(name, readServer.getJob().getName());
    assertEquals(namespace, readServer.getJob().getNamespace());
    assertEquals(runStateUpdate.getEventType().name(), readServer.getEventType().name());
    assertEquals(runStateUpdate.getEventTime(), readServer.getEventTime());
    assertEquals(json, mapper.writeValueAsString(readServer));
  }
}
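Outside of tests, an event built this way is typically emitted to a lineage backend rather than just serialized. A hedged sketch using the Java client's HTTP transport; the endpoint URL is a placeholder and transport configuration varies by deployment:
import io.openlineage.client.OpenLineageClient;
import io.openlineage.client.transports.HttpTransport;

// Sketch: send the runStateUpdate built above to an OpenLineage-compatible backend.
OpenLineageClient client =
    OpenLineageClient.builder()
        .transport(HttpTransport.builder().uri("http://localhost:5000").build())
        .build();
client.emit(runStateUpdate);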
Use of io.openlineage.client.OpenLineage.OutputDataset in project OpenLineage by OpenLineage.
The class TruncateTableCommandVisitor, method apply.
@Override
public List<OutputDataset> apply(LogicalPlan x) {
  TruncateTableCommand command = (TruncateTableCommand) x;
  Optional<CatalogTable> tableOpt = catalogTableFor(command.tableName());
  if (tableOpt.isPresent()) {
    CatalogTable table = tableOpt.get();
    DatasetIdentifier datasetIdentifier = PathUtils.fromCatalogTable(table);
    DatasetFactory<OutputDataset> datasetFactory = outputDataset();
    return Collections.singletonList(
        datasetFactory.getDataset(
            datasetIdentifier,
            new OpenLineage.DatasetFacetsBuilder()
                .schema(null)
                .dataSource(PlanUtils.datasourceFacet(
                    context.getOpenLineage(), datasetIdentifier.getNamespace()))
                .lifecycleStateChange(
                    context.getOpenLineage().newLifecycleStateChangeDatasetFacet(
                        OpenLineage.LifecycleStateChangeDatasetFacet.LifecycleStateChange.TRUNCATE, null))
                .build()));
  } else {
    // table does not exist, cannot prepare an event
    return Collections.emptyList();
  }
}
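Visitors like this one act as partial functions over logical plan nodes: apply only runs for nodes that a matching isDefinedAt guard accepts. A sketch of what that guard plausibly looks like here; the actual implementation in the project may differ:
// Assumed guard: the visitor only fires for TruncateTableCommand nodes.
@Override
public boolean isDefinedAt(LogicalPlan plan) {
  return plan instanceof TruncateTableCommand;
}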
Use of io.openlineage.client.OpenLineage.OutputDataset in project OpenLineage by OpenLineage.
The class InternalEventHandlerFactory, method createOutputDatasetBuilder.
@Override
public Collection<PartialFunction<Object, List<OutputDataset>>> createOutputDatasetBuilder(
    OpenLineageContext context) {
  ImmutableList<PartialFunction<Object, List<OutputDataset>>> outputDatasetBuilders =
      ImmutableList.<PartialFunction<Object, List<OutputDataset>>>builder()
          .addAll(generate(eventHandlerFactories, factory -> factory.createOutputDatasetBuilder(context)))
          .addAll(DatasetBuilderFactoryProvider.getInstance().getOutputBuilders(context))
          .build();
  context.getOutputDatasetBuilders().addAll(outputDatasetBuilders);
  return outputDatasetBuilders;
}
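For illustration, a minimal sketch of how such a list of PartialFunction builders can be consumed: each builder is probed with isDefinedAt and applied only where it matches. Method names come from scala.PartialFunction; the event variable is a placeholder:
List<OutputDataset> outputs = new ArrayList<>();
for (PartialFunction<Object, List<OutputDataset>> builder : outputDatasetBuilders) {
  // Only apply builders that declare themselves defined for this event type.
  if (builder.isDefinedAt(event)) {
    outputs.addAll(builder.apply(event));
  }
}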