
Example 1 with SparkListenerEvent

Use of org.apache.spark.scheduler.SparkListenerEvent in project OpenLineage by OpenLineage.

The class SaveIntoDataSourceCommandVisitor, method apply:

@Override
public List<OpenLineage.OutputDataset> apply(SparkListenerEvent event) {
    BaseRelation relation;
    SaveIntoDataSourceCommand command =
        (SaveIntoDataSourceCommand) context.getQueryExecution().get().optimizedPlan();
    // Handle Kafka sources explicitly first, as other impls of CreatableRelationProvider
    // may not be handled in the generic way below.
    if (KafkaRelationVisitor.isKafkaSource(command.dataSource())) {
        return KafkaRelationVisitor.createKafkaDatasets(outputDataset(), command.dataSource(), command.options(), command.mode(), command.schema());
    }
    if (command.dataSource().getClass().getName().contains("DeltaDataSource")) {
        if (command.options().contains("path")) {
            URI uri = URI.create(command.options().get("path").get());
            return Collections.singletonList(outputDataset().getDataset(PathUtils.fromURI(uri, "file"), command.schema()));
        }
    }
    SQLContext sqlContext = context.getSparkSession().get().sqlContext();
    try {
        if (command.dataSource() instanceof RelationProvider) {
            RelationProvider p = (RelationProvider) command.dataSource();
            relation = p.createRelation(sqlContext, command.options());
        } else {
            SchemaRelationProvider p = (SchemaRelationProvider) command.dataSource();
            relation = p.createRelation(sqlContext, command.options(), command.schema());
        }
    } catch (Exception ex) {
        // Error types are not detected precisely across the Scala/Java boundary,
        // so check for SQLException explicitly.
        if (ex instanceof SQLException) {
            // This can happen on SparkListenerSQLExecutionStart, e.g. for SQLite, when the database
            // does not exist yet - it will be created when the command executes.
            // We can safely ignore it on the start event, because it will succeed on the end event;
            // see SparkReadWriteIntegTest.testReadFromFileWriteToJdbc.
            log.warn("Can't create relation: ", ex);
            return Collections.emptyList();
        }
        throw ex;
    }
    LogicalRelation logicalRelation = new LogicalRelation(
        relation, relation.schema().toAttributes(), Option.empty(), command.isStreaming());
    return delegate(
            context.getOutputDatasetQueryPlanVisitors(), context.getOutputDatasetBuilders(), event)
        .applyOrElse(
            logicalRelation,
            ScalaConversionUtils.toScalaFn((lp) -> Collections.<OutputDataset>emptyList()))
        .stream()
        .map(ds -> {
        Builder<String, OpenLineage.DatasetFacet> facetsMap = ImmutableMap.<String, OpenLineage.DatasetFacet>builder();
        if (ds.getFacets().getAdditionalProperties() != null) {
            facetsMap.putAll(ds.getFacets().getAdditionalProperties());
        }
        ds.getFacets().getAdditionalProperties().putAll(facetsMap.build());
        if (SaveMode.Overwrite == command.mode()) {
            // rebuild whole dataset with a LifecycleStateChange facet added
            OpenLineage.DatasetFacets facets = context.getOpenLineage().newDatasetFacets(
                ds.getFacets().getDocumentation(),
                ds.getFacets().getDataSource(),
                ds.getFacets().getVersion(),
                ds.getFacets().getSchema(),
                context.getOpenLineage().newLifecycleStateChangeDatasetFacet(
                    OpenLineage.LifecycleStateChangeDatasetFacet.LifecycleStateChange.OVERWRITE,
                    null));
            OpenLineage.OutputDataset newDs = context.getOpenLineage().newOutputDataset(
                ds.getNamespace(), ds.getName(), facets, ds.getOutputFacets());
            return newDs;
        }
        return ds;
    }).collect(Collectors.toList());
}
Also used: SaveMode(org.apache.spark.sql.SaveMode) SaveIntoDataSourceCommand(org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand) LogicalPlan(org.apache.spark.sql.catalyst.plans.logical.LogicalPlan) BaseRelation(org.apache.spark.sql.sources.BaseRelation) ImmutableMap(com.google.common.collect.ImmutableMap) OpenLineageContext(io.openlineage.spark.api.OpenLineageContext) SQLContext(org.apache.spark.sql.SQLContext) RelationProvider(org.apache.spark.sql.sources.RelationProvider) PathUtils(io.openlineage.spark.agent.util.PathUtils) AbstractQueryPlanDatasetBuilder(io.openlineage.spark.api.AbstractQueryPlanDatasetBuilder) LogicalRelation(org.apache.spark.sql.execution.datasources.LogicalRelation) ScalaConversionUtils(io.openlineage.spark.agent.util.ScalaConversionUtils) Option(scala.Option) Collectors(java.util.stream.Collectors) SparkListenerEvent(org.apache.spark.scheduler.SparkListenerEvent) Builder(com.google.common.collect.ImmutableMap.Builder) SQLException(java.sql.SQLException) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) SchemaRelationProvider(org.apache.spark.sql.sources.SchemaRelationProvider) OutputDataset(io.openlineage.client.OpenLineage.OutputDataset) URI(java.net.URI) OpenLineage(io.openlineage.client.OpenLineage) Collections(java.util.Collections)
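
For context, here is a minimal, hedged sketch (not OpenLineage's actual listener class; the name SqlExecutionEventSketch and the logging are illustrative assumptions) of how a SparkListenerEvent reaches a builder like the visitor above: the agent's SparkListener receives SQL execution lifecycle events through onOtherEvent and reacts on start and end.

import org.apache.spark.scheduler.SparkListener;
import org.apache.spark.scheduler.SparkListenerEvent;
import org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd;
import org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart;

// Illustrative sketch only; not part of the OpenLineage codebase.
public class SqlExecutionEventSketch extends SparkListener {

    @Override
    public void onOtherEvent(SparkListenerEvent event) {
        // SQL execution lifecycle events arrive here rather than through dedicated callbacks.
        if (event instanceof SparkListenerSQLExecutionStart) {
            long executionId = ((SparkListenerSQLExecutionStart) event).executionId();
            // A real agent would resolve the QueryExecution for this id and run its
            // output-dataset visitors, tolerating start-time failures as the visitor above does.
            System.out.println("SQL execution started: " + executionId);
        } else if (event instanceof SparkListenerSQLExecutionEnd) {
            long executionId = ((SparkListenerSQLExecutionEnd) event).executionId();
            System.out.println("SQL execution finished: " + executionId);
        }
    }
}

Such a listener would be registered with sparkContext.addSparkListener(new SqlExecutionEventSketch()), the same mechanism used in Example 3 below.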

Example 2 with SparkListenerEvent

Use of org.apache.spark.scheduler.SparkListenerEvent in project OpenLineage by OpenLineage.

The class AppendDataDatasetBuilder, method apply:

@Override
public List<OpenLineage.OutputDataset> apply(SparkListenerEvent event) {
    // Needs to cast to logical plan despite IntelliJ claiming otherwise.
    AppendData appendData = (AppendData) context.getQueryExecution().get().optimizedPlan();
    LogicalPlan logicalPlan = (LogicalPlan) appendData.table();
    return delegate(
            context.getOutputDatasetQueryPlanVisitors(), context.getOutputDatasetBuilders(), event)
        .applyOrElse(
            logicalPlan,
            ScalaConversionUtils.toScalaFn((lp) -> Collections.<OpenLineage.OutputDataset>emptyList()))
        .stream()
        .collect(Collectors.toList());
}
Also used: AbstractQueryPlanOutputDatasetBuilder(io.openlineage.spark.api.AbstractQueryPlanOutputDatasetBuilder) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) LogicalPlan(org.apache.spark.sql.catalyst.plans.logical.LogicalPlan) DatasetFactory(io.openlineage.spark.api.DatasetFactory) OpenLineageContext(io.openlineage.spark.api.OpenLineageContext) AppendData(org.apache.spark.sql.catalyst.plans.logical.AppendData) ScalaConversionUtils(io.openlineage.spark.agent.util.ScalaConversionUtils) OpenLineage(io.openlineage.client.OpenLineage) Collections(java.util.Collections) Collectors(java.util.stream.Collectors) SparkListenerEvent(org.apache.spark.scheduler.SparkListenerEvent)
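
As a rough alternative sketch, the same optimized plan can also be reached through Spark's QueryExecutionListener, which makes it clearer where the AppendData node handled above comes from. This assumes a Spark version with the DataSourceV2 AppendData node (2.4+); the class name AppendDataInspectionSketch is an illustrative assumption, not part of OpenLineage.

import org.apache.spark.sql.catalyst.plans.logical.AppendData;
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan;
import org.apache.spark.sql.execution.QueryExecution;
import org.apache.spark.sql.util.QueryExecutionListener;

// Illustrative sketch only; not part of the OpenLineage codebase.
public class AppendDataInspectionSketch implements QueryExecutionListener {

    @Override
    public void onSuccess(String funcName, QueryExecution qe, long durationNs) {
        LogicalPlan optimized = qe.optimizedPlan();
        if (optimized instanceof AppendData) {
            // The write target of a DataSourceV2 append; the builder above delegates
            // this plan to the registered output-dataset visitors.
            LogicalPlan table = (LogicalPlan) ((AppendData) optimized).table();
            System.out.println("AppendData target: " + table.getClass().getName());
        }
    }

    @Override
    public void onFailure(String funcName, QueryExecution qe, Exception exception) {
        // no-op for this sketch
    }
}

It would be registered via spark.listenerManager().register(new AppendDataInspectionSketch()).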

Example 3 with SparkListenerEvent

Use of org.apache.spark.scheduler.SparkListenerEvent in project kylo by Teradata.

The class SparkDataSetProviderV1, method onJobEnd:

@Override
protected void onJobEnd(@Nonnull final Function1<SparkListenerJobEnd, Unit> function, @Nonnull final KyloCatalogClient<DataFrame> client) {
    final SparkListener listener = new JavaSparkListener() {

        @Override
        public void onJobEnd(@Nonnull final SparkListenerJobEnd jobEnd) {
            function.apply(jobEnd);
        }

        // method required for CDH 5.8+
        @SuppressWarnings("unused")
        public void onOtherEvent(@Nonnull final SparkListenerEvent event) {
        // ignored
        }
    };
    ((KyloCatalogClientV1) client).getSQLContext().sparkContext().addSparkListener(listener);
}
Also used: JavaSparkListener(org.apache.spark.JavaSparkListener) SparkListener(org.apache.spark.scheduler.SparkListener) Nonnull(javax.annotation.Nonnull) SparkListenerJobEnd(org.apache.spark.scheduler.SparkListenerJobEnd) KyloCatalogClientV1(com.thinkbiganalytics.kylo.catalog.spark.KyloCatalogClientV1) SparkListenerEvent(org.apache.spark.scheduler.SparkListenerEvent)
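
The JavaSparkListener adapter above is only needed on Spark 1.x. A rough modern equivalent, assuming Spark 2.x or later where SparkListener is an abstract class with no-op defaults (the class name JobEndListenerSketch is illustrative), looks like this:

import org.apache.spark.SparkContext;
import org.apache.spark.scheduler.SparkListener;
import org.apache.spark.scheduler.SparkListenerEvent;
import org.apache.spark.scheduler.SparkListenerJobEnd;

// Illustrative sketch; on Spark 2.x+ no JavaSparkListener adapter is required.
public class JobEndListenerSketch extends SparkListener {

    @Override
    public void onJobEnd(SparkListenerJobEnd jobEnd) {
        System.out.println("Job " + jobEnd.jobId() + " finished");
    }

    @Override
    public void onOtherEvent(SparkListenerEvent event) {
        // Custom and SQL-specific events land here; ignored in this sketch.
    }

    public static void register(SparkContext sc) {
        // Equivalent to the addSparkListener call in the Kylo example above.
        sc.addSparkListener(new JobEndListenerSketch());
    }
}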

Aggregations

SparkListenerEvent (org.apache.spark.scheduler.SparkListenerEvent) 3
OpenLineage (io.openlineage.client.OpenLineage) 2
ScalaConversionUtils (io.openlineage.spark.agent.util.ScalaConversionUtils) 2
OpenLineageContext (io.openlineage.spark.api.OpenLineageContext) 2
Collections (java.util.Collections) 2
List (java.util.List) 2
Collectors (java.util.stream.Collectors) 2
Slf4j (lombok.extern.slf4j.Slf4j) 2
LogicalPlan (org.apache.spark.sql.catalyst.plans.logical.LogicalPlan) 2
ImmutableMap (com.google.common.collect.ImmutableMap) 1
Builder (com.google.common.collect.ImmutableMap.Builder) 1
KyloCatalogClientV1 (com.thinkbiganalytics.kylo.catalog.spark.KyloCatalogClientV1) 1
OutputDataset (io.openlineage.client.OpenLineage.OutputDataset) 1
PathUtils (io.openlineage.spark.agent.util.PathUtils) 1
AbstractQueryPlanDatasetBuilder (io.openlineage.spark.api.AbstractQueryPlanDatasetBuilder) 1
AbstractQueryPlanOutputDatasetBuilder (io.openlineage.spark.api.AbstractQueryPlanOutputDatasetBuilder) 1
DatasetFactory (io.openlineage.spark.api.DatasetFactory) 1
URI (java.net.URI) 1
SQLException (java.sql.SQLException) 1
Nonnull (javax.annotation.Nonnull) 1