Example 1 with DatasetIdentifier

Use of io.openlineage.spark.agent.util.DatasetIdentifier in the OpenLineage project.

From class InsertIntoHadoopFsRelationVisitor, method apply:

@Override
public List<OpenLineage.OutputDataset> apply(LogicalPlan x) {
    InsertIntoHadoopFsRelationCommand command = (InsertIntoHadoopFsRelationCommand) x;
    DatasetIdentifier di = PathUtils.fromURI(command.outputPath().toUri(), "file");
    OpenLineage.OutputDataset outputDataset;
    if (SaveMode.Overwrite == command.mode()) {
        outputDataset = outputDataset().getDataset(di, command.query().schema(), OpenLineage.LifecycleStateChangeDatasetFacet.LifecycleStateChange.OVERWRITE);
    } else {
        outputDataset = outputDataset().getDataset(di, command.query().schema());
    }
    return Collections.singletonList(outputDataset);
}
Also used : DatasetIdentifier(io.openlineage.spark.agent.util.DatasetIdentifier) OpenLineage(io.openlineage.client.OpenLineage) InsertIntoHadoopFsRelationCommand(org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand)
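
A minimal, self-contained sketch of what this visitor produces for a plain filesystem path; the path literal and class name are hypothetical, while PathUtils.fromURI and the "file" fallback scheme come straight from the snippet above.

import io.openlineage.spark.agent.util.DatasetIdentifier;
import io.openlineage.spark.agent.util.PathUtils;
import java.net.URI;

public class FromUriSketch {
    public static void main(String[] args) {
        // hypothetical output path; "file" is the fallback scheme, exactly as in the visitor
        DatasetIdentifier di = PathUtils.fromURI(URI.create("/tmp/warehouse/orders"), "file");
        // should print something like: /tmp/warehouse/orders @ file
        System.out.println(di.getName() + " @ " + di.getNamespace());
    }
}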

Example 2 with DatasetIdentifier

Use of io.openlineage.spark.agent.util.DatasetIdentifier in the OpenLineage project.

From class TruncateTableCommandVisitor, method apply:

@Override
public List<OutputDataset> apply(LogicalPlan x) {
    TruncateTableCommand command = (TruncateTableCommand) x;
    Optional<CatalogTable> tableOpt = catalogTableFor(command.tableName());
    if (tableOpt.isPresent()) {
        CatalogTable table = tableOpt.get();
        DatasetIdentifier datasetIdentifier = PathUtils.fromCatalogTable(table);
        DatasetFactory<OutputDataset> datasetFactory = outputDataset();
        return Collections.singletonList(
            datasetFactory.getDataset(
                datasetIdentifier,
                new OpenLineage.DatasetFacetsBuilder()
                    .schema(null)
                    .dataSource(
                        PlanUtils.datasourceFacet(
                            context.getOpenLineage(), datasetIdentifier.getNamespace()))
                    .lifecycleStateChange(
                        context.getOpenLineage()
                            .newLifecycleStateChangeDatasetFacet(
                                OpenLineage.LifecycleStateChangeDatasetFacet.LifecycleStateChange.TRUNCATE,
                                null))
                    .build()));
    } else {
        // table does not exist, cannot prepare an event
        return Collections.emptyList();
    }
}
Also used : TruncateTableCommand(org.apache.spark.sql.execution.command.TruncateTableCommand) DatasetIdentifier(io.openlineage.spark.agent.util.DatasetIdentifier) OutputDataset(io.openlineage.client.OpenLineage.OutputDataset) CatalogTable(org.apache.spark.sql.catalyst.catalog.CatalogTable)
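
The facet assembly is the interesting part of this visitor. A standalone sketch of the same TRUNCATE lifecycle facet, assuming a hypothetical producer URI in place of the one the agent's context supplies:

import io.openlineage.client.OpenLineage;
import java.net.URI;

public class TruncateFacetSketch {
    public static void main(String[] args) {
        // hypothetical producer URI; in the agent this instance comes from context.getOpenLineage()
        OpenLineage openLineage = new OpenLineage(URI.create("https://example.com/openlineage"));
        OpenLineage.DatasetFacets facets = new OpenLineage.DatasetFacetsBuilder()
            .lifecycleStateChange(
                openLineage.newLifecycleStateChangeDatasetFacet(
                    OpenLineage.LifecycleStateChangeDatasetFacet.LifecycleStateChange.TRUNCATE,
                    // a truncate has no previous table identifier
                    null))
            .build();
        System.out.println(facets.getLifecycleStateChange().getLifecycleStateChange());
    }
}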

Example 3 with DatasetIdentifier

Use of io.openlineage.spark.agent.util.DatasetIdentifier in the OpenLineage project.

From class InsertIntoDataSourceDirVisitor, method apply:

@Override
public List<OpenLineage.OutputDataset> apply(LogicalPlan x) {
    InsertIntoDataSourceDirCommand command = (InsertIntoDataSourceDirCommand) x;
    // URI is required by the InsertIntoDataSourceDirCommand
    DatasetIdentifier di = PathUtils.fromURI(command.storage().locationUri().get(), "file");
    OpenLineage.OutputDataset outputDataset;
    if (command.overwrite()) {
        outputDataset = outputDataset().getDataset(di, command.schema(), OpenLineage.LifecycleStateChangeDatasetFacet.LifecycleStateChange.OVERWRITE);
    } else {
        outputDataset = outputDataset().getDataset(di, command.schema());
    }
    return Collections.singletonList(outputDataset);
}
Also used : DatasetIdentifier(io.openlineage.spark.agent.util.DatasetIdentifier) OpenLineage(io.openlineage.client.OpenLineage) InsertIntoDataSourceDirCommand(org.apache.spark.sql.execution.command.InsertIntoDataSourceDirCommand)
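
The unguarded locationUri().get() works because, as the inline comment notes, the command guarantees a URI. A small illustration of that scala.Option contract from Java, with hypothetical values:

import scala.Option;

public class ScalaOptionSketch {
    public static void main(String[] args) {
        // a present location, as InsertIntoDataSourceDirCommand guarantees
        Option<String> locationUri = Option.apply("file:/tmp/output");
        System.out.println(locationUri.get());
        // on an empty Option, get() throws NoSuchElementException,
        // so callers without that guarantee should check first
        Option<String> missing = Option.empty();
        System.out.println(missing.isDefined() ? missing.get() : "<no location>");
    }
}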

Example 4 with DatasetIdentifier

Use of io.openlineage.spark.agent.util.DatasetIdentifier in the OpenLineage project.

From class IcebergHandler, method getDatasetIdentifier:

@Override
public DatasetIdentifier getDatasetIdentifier(SparkSession session, TableCatalog tableCatalog, Identifier identifier, Map<String, String> properties) {
    SparkCatalog sparkCatalog = (SparkCatalog) tableCatalog;
    String catalogName = sparkCatalog.name();
    String prefix = String.format("spark.sql.catalog.%s", catalogName);
    Map<String, String> conf = ScalaConversionUtils.<String, String>fromMap(session.conf().getAll());
    log.info(conf.toString());
    Map<String, String> catalogConf =
        conf.entrySet().stream()
            .filter(x -> x.getKey().startsWith(prefix))
            .filter(x -> x.getKey().length() > prefix.length())
            .collect(
                Collectors.toMap(
                    // strip the prefix plus the dot that follows it
                    x -> x.getKey().substring(prefix.length() + 1),
                    Map.Entry::getValue));
    log.info(catalogConf.toString());
    if (catalogConf.isEmpty() || !catalogConf.containsKey("type")) {
        throw new UnsupportedCatalogException(catalogName);
    }
    log.info(catalogConf.get("type"));
    switch(catalogConf.get("type")) {
        case "hadoop":
            return getHadoopIdentifier(catalogConf, identifier.toString());
        case "hive":
            return getHiveIdentifier(session, catalogConf.get(CatalogProperties.URI), identifier.toString());
        default:
            throw new UnsupportedCatalogException(catalogConf.get("type"));
    }
}
Also used : SparkCatalog(org.apache.iceberg.spark.SparkCatalog) SneakyThrows(lombok.SneakyThrows) DatasetIdentifier(io.openlineage.spark.agent.util.DatasetIdentifier) PathUtils(io.openlineage.spark.agent.util.PathUtils) ScalaConversionUtils(io.openlineage.spark.agent.util.ScalaConversionUtils) Collectors(java.util.stream.Collectors) CatalogProperties(org.apache.iceberg.CatalogProperties) Slf4j(lombok.extern.slf4j.Slf4j) TableCatalog(org.apache.spark.sql.connector.catalog.TableCatalog) NoSuchTableException(org.apache.spark.sql.catalyst.analysis.NoSuchTableException) TableProviderFacet(io.openlineage.spark.agent.facets.TableProviderFacet) Map(java.util.Map) Optional(java.util.Optional) Path(org.apache.hadoop.fs.Path) URI(java.net.URI) Identifier(org.apache.spark.sql.connector.catalog.Identifier) SparkTable(org.apache.iceberg.spark.source.SparkTable) Nullable(javax.annotation.Nullable) SparkConfUtils(io.openlineage.spark.agent.util.SparkConfUtils) SparkSession(org.apache.spark.sql.SparkSession)
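
The prefix filtering is plain JDK streams and easy to verify in isolation. A sketch with hypothetical Spark conf entries for a catalog named "iceberg":

import java.util.Map;
import java.util.stream.Collectors;

public class CatalogConfSketch {
    public static void main(String[] args) {
        // hypothetical Spark session conf entries for a catalog named "iceberg"
        Map<String, String> conf = Map.of(
            "spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog",
            "spark.sql.catalog.iceberg.type", "hadoop",
            "spark.sql.catalog.iceberg.warehouse", "/tmp/warehouse");
        String prefix = "spark.sql.catalog.iceberg";
        Map<String, String> catalogConf = conf.entrySet().stream()
            .filter(x -> x.getKey().startsWith(prefix))
            // drop the bare "spark.sql.catalog.iceberg" entry: it names the implementation class
            .filter(x -> x.getKey().length() > prefix.length())
            // the +1 skips the dot after the prefix
            .collect(Collectors.toMap(x -> x.getKey().substring(prefix.length() + 1), Map.Entry::getValue));
        System.out.println(catalogConf); // e.g. {type=hadoop, warehouse=/tmp/warehouse} (order may vary)
    }
}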

Example 5 with DatasetIdentifier

Use of io.openlineage.spark.agent.util.DatasetIdentifier in the OpenLineage project.

From class PlanUtils3, method fromDataSourceV2Relation:

public static <D extends OpenLineage.Dataset> List<D> fromDataSourceV2Relation(
        DatasetFactory<D> datasetFactory,
        OpenLineageContext context,
        DataSourceV2Relation relation,
        OpenLineage.DatasetFacetsBuilder datasetFacetsBuilder) {
    if (relation.identifier().isEmpty()) {
        throw new IllegalArgumentException("Couldn't find identifier for dataset in plan " + relation);
    }
    Identifier identifier = relation.identifier().get();
    if (relation.catalog().isEmpty() || !(relation.catalog().get() instanceof TableCatalog)) {
        throw new IllegalArgumentException("Couldn't find catalog for dataset in plan " + relation);
    }
    TableCatalog tableCatalog = (TableCatalog) relation.catalog().get();
    Map<String, String> tableProperties = relation.table().properties();
    Optional<DatasetIdentifier> di = PlanUtils3.getDatasetIdentifier(context, tableCatalog, identifier, tableProperties);
    if (!di.isPresent()) {
        return Collections.emptyList();
    }
    OpenLineage openLineage = context.getOpenLineage();
    datasetFacetsBuilder
        .schema(PlanUtils.schemaFacet(openLineage, relation.schema()))
        .dataSource(PlanUtils.datasourceFacet(openLineage, di.get().getNamespace()));
    CatalogUtils3.getTableProviderFacet(tableCatalog, tableProperties)
        .map(provider -> datasetFacetsBuilder.put("tableProvider", provider));
    return Collections.singletonList(
        datasetFactory.getDataset(di.get().getName(), di.get().getNamespace(), datasetFacetsBuilder.build()));
}
Also used : DatasetIdentifier(io.openlineage.spark.agent.util.DatasetIdentifier) Identifier(org.apache.spark.sql.connector.catalog.Identifier) TableCatalog(org.apache.spark.sql.connector.catalog.TableCatalog) OpenLineage(io.openlineage.client.OpenLineage)
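
One subtlety worth noting: the tableProvider facet above is attached through Optional.map purely for its side effect, with the mapped value discarded; ifPresent expresses the same intent more directly. A self-contained illustration with hypothetical values:

import java.util.Optional;

public class OptionalSideEffectSketch {
    public static void main(String[] args) {
        StringBuilder facets = new StringBuilder("facets:");
        Optional<String> provider = Optional.of("iceberg"); // hypothetical facet value
        // equivalent to the map-for-side-effect call in fromDataSourceV2Relation,
        // but without producing a throwaway Optional
        provider.ifPresent(p -> facets.append(" tableProvider=").append(p));
        System.out.println(facets); // facets: tableProvider=iceberg
    }
}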

Aggregations

DatasetIdentifier (io.openlineage.spark.agent.util.DatasetIdentifier): 26 usages
OpenLineage (io.openlineage.client.OpenLineage): 10 usages
TableCatalog (org.apache.spark.sql.connector.catalog.TableCatalog): 8 usages
Test (org.junit.jupiter.api.Test): 8 usages
Identifier (org.apache.spark.sql.connector.catalog.Identifier): 7 usages
SneakyThrows (lombok.SneakyThrows): 5 usages
SparkSession (org.apache.spark.sql.SparkSession): 5 usages
CatalogTable (org.apache.spark.sql.catalyst.catalog.CatalogTable): 5 usages
PathUtils (io.openlineage.spark.agent.util.PathUtils): 4 usages
Optional (java.util.Optional): 4 usages
TableProviderFacet (io.openlineage.spark.agent.facets.TableProviderFacet): 3 usages
CatalogUtils3 (io.openlineage.spark3.agent.lifecycle.plan.catalog.CatalogUtils3): 3 usages
Map (java.util.Map): 3 usages
Slf4j (lombok.extern.slf4j.Slf4j): 3 usages
Path (org.apache.hadoop.fs.Path): 3 usages
SparkCatalog (org.apache.iceberg.spark.SparkCatalog): 3 usages
ReplaceTable (org.apache.spark.sql.catalyst.plans.logical.ReplaceTable): 3 usages
StructType (org.apache.spark.sql.types.StructType): 3 usages
ScalaConversionUtils (io.openlineage.spark.agent.util.ScalaConversionUtils): 2 usages
URI (java.net.URI): 2 usages