
Example 1 with Table

Use of org.apache.spark.sql.connector.catalog.Table in the apache/iceberg project.

From the class Spark3Util, method getPartitions:

/**
 * Use Spark to list all partitions in the table.
 *
 * @param spark a Spark session
 * @param rootPath the table's root (location) path
 * @param format the format of the table's data files
 * @return all of the table's partitions
 */
public static List<SparkPartition> getPartitions(SparkSession spark, Path rootPath, String format) {
    FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark);
    Map<String, String> emptyMap = Collections.emptyMap();
    // Build an in-memory file index over the table's root path; Spark infers the
    // Hive-style partition layout from the directory structure.
    InMemoryFileIndex fileIndex = new InMemoryFileIndex(
        spark,
        JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(rootPath)).asScala().toSeq(),
        JavaConverters.mapAsScalaMapConverter(emptyMap).asScala().toMap(Predef.conforms()),
        Option.empty(),
        fileStatusCache,
        Option.empty(),
        Option.empty());
    org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec();
    StructType schema = spec.partitionColumns();
    return JavaConverters.seqAsJavaListConverter(spec.partitions()).asJava().stream().map(partition -> {
        Map<String, String> values = Maps.newHashMap();
        // Convert each Catalyst partition value back to a Java value and render it as a string.
        JavaConverters.asJavaIterableConverter(schema).asJava().forEach(field -> {
            int fieldIndex = schema.fieldIndex(field.name());
            Object catalystValue = partition.values().get(fieldIndex, field.dataType());
            Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType());
            values.put(field.name(), String.valueOf(value));
        });
        return new SparkPartition(values, partition.path().toString(), format);
    }).collect(Collectors.toList());
}
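
A minimal usage sketch of this helper (the warehouse path and file format below are hypothetical):

// Hypothetical usage: list the Hive-style partitions under a Parquet table's root directory.
SparkSession spark = SparkSession.builder()
    .master("local[*]")
    .appName("list-partitions")
    .getOrCreate();
Path rootPath = new Path("/warehouse/db.db/events");  // hypothetical table location
List<SparkPartition> partitions = Spark3Util.getPartitions(spark, rootPath, "parquet");
// Each SparkPartition carries the partition values, the partition directory, and the file format.
partitions.forEach(partition -> System.out.println(partition));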

Example 2 with Table

Use of org.apache.spark.sql.connector.catalog.Table in the apache/iceberg project.

From the class SparkSessionCatalog, method stageCreateOrReplace:

@Override
public StagedTable stageCreateOrReplace(Identifier ident, StructType schema, Transform[] partitions, Map<String, String> properties) throws NoSuchNamespaceException {
    String provider = properties.get("provider");
    TableCatalog catalog;
    if (useIceberg(provider)) {
        if (asStagingCatalog != null) {
            return asStagingCatalog.stageCreateOrReplace(ident, schema, partitions, properties);
        }
        catalog = icebergCatalog;
    } else {
        catalog = getSessionCatalog();
    }
    // drop the table if it exists
    catalog.dropTable(ident);
    try {
        // create the table with the chosen catalog, then wrap it in a staged table that will drop it to roll back
        Table sessionCatalogTable = catalog.createTable(ident, schema, partitions, properties);
        return new RollbackStagedTable(catalog, ident, sessionCatalogTable);
    } catch (TableAlreadyExistsException e) {
        // the table was just dropped but already exists again (concurrent create); retry the replace
        return stageCreateOrReplace(ident, schema, partitions, properties);
    }
}
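
For context, a staged table created this way rolls back by dropping the eagerly created table if the operation is aborted. A simplified, hypothetical sketch of that idea (not the actual RollbackStagedTable source, which also forwards reads and writes to the wrapped table):

import java.util.Set;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.StagedTable;
import org.apache.spark.sql.connector.catalog.Table;
import org.apache.spark.sql.connector.catalog.TableCapability;
import org.apache.spark.sql.connector.catalog.TableCatalog;
import org.apache.spark.sql.types.StructType;

// Hypothetical, simplified rollback wrapper: commit is a no-op because the table
// was created eagerly; abort rolls back by dropping it.
class SimpleRollbackStagedTable implements StagedTable {
    private final TableCatalog catalog;
    private final Identifier ident;
    private final Table table;

    SimpleRollbackStagedTable(TableCatalog catalog, Identifier ident, Table table) {
        this.catalog = catalog;
        this.ident = ident;
        this.table = table;
    }

    @Override
    public void commitStagedChanges() {
        // nothing to do: the table already exists in the catalog
    }

    @Override
    public void abortStagedChanges() {
        // roll back the staged create/replace by dropping the eagerly created table
        catalog.dropTable(ident);
    }

    @Override
    public String name() {
        return table.name();
    }

    @Override
    public StructType schema() {
        return table.schema();
    }

    @Override
    public Set<TableCapability> capabilities() {
        return table.capabilities();
    }
}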

Example 3 with Table

Use of org.apache.spark.sql.connector.catalog.Table in the apache/iceberg project.

From the class SparkSessionCatalog, method stageCreate:

@Override
public StagedTable stageCreate(Identifier ident, StructType schema, Transform[] partitions, Map<String, String> properties) throws TableAlreadyExistsException, NoSuchNamespaceException {
    String provider = properties.get("provider");
    TableCatalog catalog;
    if (useIceberg(provider)) {
        if (asStagingCatalog != null) {
            return asStagingCatalog.stageCreate(ident, schema, partitions, properties);
        }
        catalog = icebergCatalog;
    } else {
        catalog = getSessionCatalog();
    }
    // create the table with the chosen catalog, then wrap it in a staged table that will drop it to roll back
    Table table = catalog.createTable(ident, schema, partitions, properties);
    return new RollbackStagedTable(catalog, ident, table);
}
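
This path is what an atomic CTAS statement exercises when the catalog implements StagingTableCatalog; a hedged example (the database and table names are hypothetical):

// Hypothetical: CREATE TABLE ... AS SELECT routes through stageCreate, so a failed
// write drops the half-created table instead of leaving it behind.
spark.sql("CREATE TABLE db.ctas_target USING iceberg AS SELECT * FROM db.source");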

Example 4 with Table

Use of org.apache.spark.sql.connector.catalog.Table in the apache/iceberg project.

From the class BaseProcedure, method loadSparkTable:

protected SparkTable loadSparkTable(Identifier ident) {
    try {
        Table table = tableCatalog.loadTable(ident);
        ValidationException.check(table instanceof SparkTable, "%s is not %s", ident, SparkTable.class.getName());
        return (SparkTable) table;
    } catch (NoSuchTableException e) {
        String errMsg = String.format("Couldn't load table '%s' in catalog '%s'", ident, tableCatalog.name());
        throw new RuntimeException(errMsg, e);
    }
}
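
A hedged sketch of a call site inside a procedure (the identifier is hypothetical, and the table() accessor for unwrapping the underlying Iceberg table is assumed):

// Hypothetical call site: resolve an identifier and unwrap the Iceberg table behind it.
Identifier ident = Identifier.of(new String[] {"db"}, "events");  // hypothetical table
SparkTable sparkTable = loadSparkTable(ident);
org.apache.iceberg.Table icebergTable = sparkTable.table();  // assumed accessor on SparkTable
System.out.println(icebergTable.currentSnapshot());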

Example 5 with Table

Use of org.apache.spark.sql.connector.catalog.Table in the OpenLineage project.

From the class DeltaHandler, method getDatasetVersion:

@SneakyThrows
public Optional<String> getDatasetVersion(TableCatalog tableCatalog, Identifier identifier, Map<String, String> properties) {
    DeltaCatalog deltaCatalog = (DeltaCatalog) tableCatalog;
    Table table = deltaCatalog.loadTable(identifier);
    if (table instanceof DeltaTableV2) {
        DeltaTableV2 deltaTable = (DeltaTableV2) table;
        return Optional.of(Long.toString(deltaTable.snapshot().version()));
    }
    return Optional.empty();
}
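
A hedged usage sketch (the handler and catalog instances are assumed to be in scope; the identifier is hypothetical):

import java.util.Collections;
import java.util.Optional;
import org.apache.spark.sql.connector.catalog.Identifier;

// Hypothetical: resolve the current Delta snapshot version for lineage metadata.
Identifier ident = Identifier.of(new String[] {"db"}, "events");
Optional<String> version = deltaHandler.getDatasetVersion(tableCatalog, ident, Collections.emptyMap());
version.ifPresent(v -> System.out.println("Delta table version: " + v));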

Aggregations

Table (org.apache.spark.sql.connector.catalog.Table): 10 usages
TableCatalog (org.apache.spark.sql.connector.catalog.TableCatalog): 7 usages
SparkTable (org.apache.iceberg.spark.source.SparkTable): 4 usages
NoSuchTableException (org.apache.spark.sql.catalyst.analysis.NoSuchTableException): 4 usages
ByteBuffer (java.nio.ByteBuffer): 2 usages
Arrays (java.util.Arrays): 2 usages
List (java.util.List): 2 usages
Map (java.util.Map): 2 usages
Objects (java.util.Objects): 2 usages
Set (java.util.Set): 2 usages
Collectors (java.util.stream.Collectors): 2 usages
Path (org.apache.hadoop.fs.Path): 2 usages
MetadataTableType (org.apache.iceberg.MetadataTableType): 2 usages
MetadataTableUtils (org.apache.iceberg.MetadataTableUtils): 2 usages
NullOrder (org.apache.iceberg.NullOrder): 2 usages
PartitionSpec (org.apache.iceberg.PartitionSpec): 2 usages
Schema (org.apache.iceberg.Schema): 2 usages
TableOperations (org.apache.iceberg.TableOperations): 2 usages
TableProperties (org.apache.iceberg.TableProperties): 2 usages
UpdateProperties (org.apache.iceberg.UpdateProperties): 2 usages