Search in sources:

Example 1 with SaveIntoDataSourceCommand

Use of org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand in project OpenLineage by OpenLineage.

In class SaveIntoDataSourceCommandVisitor, the method apply:

@Override
public List<OpenLineage.OutputDataset> apply(SparkListenerEvent event) {
    BaseRelation relation;
    SaveIntoDataSourceCommand command = (SaveIntoDataSourceCommand) context.getQueryExecution().get().optimizedPlan();
    // handle Kafka explicitly, as other impls of CreatableRelationProvider may not
    // be handled in the generic way below
    if (KafkaRelationVisitor.isKafkaSource(command.dataSource())) {
        return KafkaRelationVisitor.createKafkaDatasets(
            outputDataset(), command.dataSource(), command.options(), command.mode(), command.schema());
    }
    if (command.dataSource().getClass().getName().contains("DeltaDataSource")) {
        if (command.options().contains("path")) {
            URI uri = URI.create(command.options().get("path").get());
            return Collections.singletonList(outputDataset().getDataset(PathUtils.fromURI(uri, "file"), command.schema()));
        }
    }
    SQLContext sqlContext = context.getSparkSession().get().sqlContext();
    try {
        if (command.dataSource() instanceof RelationProvider) {
            RelationProvider p = (RelationProvider) command.dataSource();
            relation = p.createRelation(sqlContext, command.options());
        } else {
            SchemaRelationProvider p = (SchemaRelationProvider) command.dataSource();
            relation = p.createRelation(sqlContext, command.options(), command.schema());
        }
    } catch (Exception ex) {
        // Scala code can throw checked exceptions without declaring them, so the
        // SQLException is detected manually rather than in a dedicated catch clause
        if (ex instanceof SQLException) {
            // This can happen on SparkListenerSQLExecutionStart, e.g. for SQLite, when the
            // database does not exist yet; it is created when the command executes.
            // It is safe to ignore this on start, because the relation resolves on end;
            // see SparkReadWriteIntegTest.testReadFromFileWriteToJdbc
            log.warn("Can't create relation: ", ex);
            return Collections.emptyList();
        }
        throw ex;
    }
    LogicalRelation logicalRelation =
        new LogicalRelation(
            relation, relation.schema().toAttributes(), Option.empty(), command.isStreaming());
    return delegate(context.getOutputDatasetQueryPlanVisitors(), context.getOutputDatasetBuilders(), event)
        .applyOrElse(
            logicalRelation,
            ScalaConversionUtils.toScalaFn((lp) -> Collections.<OutputDataset>emptyList()))
        .stream()
        .map(ds -> {
            // carry over any facets already attached to the dataset
            Builder<String, OpenLineage.DatasetFacet> facetsMap =
                ImmutableMap.<String, OpenLineage.DatasetFacet>builder();
            if (ds.getFacets().getAdditionalProperties() != null) {
                facetsMap.putAll(ds.getFacets().getAdditionalProperties());
            }
            ds.getFacets().getAdditionalProperties().putAll(facetsMap.build());
            if (SaveMode.Overwrite == command.mode()) {
                // rebuild the whole dataset with a LifecycleStateChange facet added
                OpenLineage.DatasetFacets facets =
                    context.getOpenLineage()
                        .newDatasetFacets(
                            ds.getFacets().getDocumentation(),
                            ds.getFacets().getDataSource(),
                            ds.getFacets().getVersion(),
                            ds.getFacets().getSchema(),
                            context.getOpenLineage()
                                .newLifecycleStateChangeDatasetFacet(
                                    OpenLineage.LifecycleStateChangeDatasetFacet.LifecycleStateChange.OVERWRITE,
                                    null));
                return context.getOpenLineage()
                    .newOutputDataset(ds.getNamespace(), ds.getName(), facets, ds.getOutputFacets());
            }
            return ds;
        })
        .collect(Collectors.toList());
}
Also used: SaveMode(org.apache.spark.sql.SaveMode) SaveIntoDataSourceCommand(org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand) LogicalPlan(org.apache.spark.sql.catalyst.plans.logical.LogicalPlan) BaseRelation(org.apache.spark.sql.sources.BaseRelation) ImmutableMap(com.google.common.collect.ImmutableMap) OpenLineageContext(io.openlineage.spark.api.OpenLineageContext) SQLContext(org.apache.spark.sql.SQLContext) RelationProvider(org.apache.spark.sql.sources.RelationProvider) PathUtils(io.openlineage.spark.agent.util.PathUtils) AbstractQueryPlanDatasetBuilder(io.openlineage.spark.api.AbstractQueryPlanDatasetBuilder) LogicalRelation(org.apache.spark.sql.execution.datasources.LogicalRelation) ScalaConversionUtils(io.openlineage.spark.agent.util.ScalaConversionUtils) Option(scala.Option) Collectors(java.util.stream.Collectors) SparkListenerEvent(org.apache.spark.scheduler.SparkListenerEvent) Builder(com.google.common.collect.ImmutableMap.Builder) SQLException(java.sql.SQLException) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) SchemaRelationProvider(org.apache.spark.sql.sources.SchemaRelationProvider) OutputDataset(io.openlineage.client.OpenLineage.OutputDataset) URI(java.net.URI) OpenLineage(io.openlineage.client.OpenLineage) Collections(java.util.Collections)
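For context, a minimal sketch of the kind of Spark write that is planned as a SaveIntoDataSourceCommand and would be matched by the visitor above. The JDBC URL and table name are illustrative placeholders (not values from the OpenLineage tests), and the SQLite JDBC driver is assumed to be on the classpath:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class SaveIntoDataSourceSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName("save-into-data-source-sketch")
            .getOrCreate();

        Dataset<Row> df = spark.range(10).toDF("id");

        // DataFrameWriter.save() against a V1 source plans a SaveIntoDataSourceCommand
        // wrapping the source's CreatableRelationProvider. SaveMode.Overwrite is the
        // mode that makes the visitor above attach the OVERWRITE lifecycle facet.
        df.write()
            .format("jdbc")
            .option("url", "jdbc:sqlite:/tmp/lineage-test.db") // illustrative URL
            .option("dbtable", "ids")                          // illustrative table
            .mode(SaveMode.Overwrite)
            .save();

        spark.stop();
    }
}

On SparkListenerSQLExecutionStart the target database may not exist yet, which is exactly the SQLException case the catch block above tolerates.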

Example 2 with SaveIntoDataSourceCommand

Use of org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand in project OpenLineage by OpenLineage.

In class BigQueryNodeVisitor, the method bigQuerySupplier:

private Optional<Supplier<BigQueryRelation>> bigQuerySupplier(LogicalPlan plan) {
    // SaveIntoDataSourceCommand is a special case because it references a CreatableRelationProvider
    // Every other write instance references a LogicalRelation(BigQueryRelation, _, _, _)
    SQLContext sqlContext = context.getSparkSession().get().sqlContext();
    if (plan instanceof SaveIntoDataSourceCommand) {
        SaveIntoDataSourceCommand saveCommand = (SaveIntoDataSourceCommand) plan;
        CreatableRelationProvider relationProvider = saveCommand.dataSource();
        if (relationProvider instanceof BigQueryRelationProvider) {
            return Optional.of(
                () ->
                    (BigQueryRelation)
                        ((BigQueryRelationProvider) relationProvider)
                            .createRelation(sqlContext, saveCommand.options(), saveCommand.schema()));
        }
    } else if (plan instanceof LogicalRelation
        && ((LogicalRelation) plan).relation() instanceof BigQueryRelation) {
        return Optional.of(() -> (BigQueryRelation) ((LogicalRelation) plan).relation());
    }
    return Optional.empty();
}
Also used: LogicalRelation(org.apache.spark.sql.execution.datasources.LogicalRelation) BigQueryRelation(com.google.cloud.spark.bigquery.BigQueryRelation) SaveIntoDataSourceCommand(org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand) CreatableRelationProvider(org.apache.spark.sql.sources.CreatableRelationProvider) BigQueryRelationProvider(com.google.cloud.spark.bigquery.BigQueryRelationProvider) SQLContext(org.apache.spark.sql.SQLContext)
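The Supplier indirection is deliberate: createRelation can be costly because it may reach out to BigQuery, so the relation is materialized only after a BigQuery source has actually been matched. A minimal sketch of a hypothetical call site in the same class (the helper name resolveBigQueryRelation is illustrative, not part of the project):

// Hypothetical helper alongside bigQuerySupplier: unwrap the Supplier only once a
// BigQuery source has matched, so non-BigQuery plans never pay for createRelation().
private Optional<BigQueryRelation> resolveBigQueryRelation(LogicalPlan plan) {
    return bigQuerySupplier(plan).map(Supplier::get);
}

Optional.map only invokes get() when a supplier is present, so the relation is built solely in the BigQuery case.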

Aggregations

SQLContext (org.apache.spark.sql.SQLContext) 2
LogicalRelation (org.apache.spark.sql.execution.datasources.LogicalRelation) 2
SaveIntoDataSourceCommand (org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand) 2
BigQueryRelation (com.google.cloud.spark.bigquery.BigQueryRelation) 1
BigQueryRelationProvider (com.google.cloud.spark.bigquery.BigQueryRelationProvider) 1
ImmutableMap (com.google.common.collect.ImmutableMap) 1
Builder (com.google.common.collect.ImmutableMap.Builder) 1
OpenLineage (io.openlineage.client.OpenLineage) 1
OutputDataset (io.openlineage.client.OpenLineage.OutputDataset) 1
PathUtils (io.openlineage.spark.agent.util.PathUtils) 1
ScalaConversionUtils (io.openlineage.spark.agent.util.ScalaConversionUtils) 1
AbstractQueryPlanDatasetBuilder (io.openlineage.spark.api.AbstractQueryPlanDatasetBuilder) 1
OpenLineageContext (io.openlineage.spark.api.OpenLineageContext) 1
URI (java.net.URI) 1
SQLException (java.sql.SQLException) 1
Collections (java.util.Collections) 1
List (java.util.List) 1
Collectors (java.util.stream.Collectors) 1
Slf4j (lombok.extern.slf4j.Slf4j) 1
SparkListenerEvent (org.apache.spark.scheduler.SparkListenerEvent) 1