Example 1 with Identifier

Use of org.apache.spark.sql.connector.catalog.Identifier in project iceberg by apache.

From the class Spark3Util, the method getPartitions:

/**
 * Use Spark to list all partitions in the table.
 *
 * @param spark a Spark session
 * @param rootPath the root path of the table on the filesystem
 * @param format the format of the table's data files
 * @return all of the table's partitions
 */
public static List<SparkPartition> getPartitions(SparkSession spark, Path rootPath, String format) {
    FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark);
    Map<String, String> emptyMap = Collections.emptyMap();
    InMemoryFileIndex fileIndex = new InMemoryFileIndex(
        spark,
        JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(rootPath)).asScala().toSeq(),
        JavaConverters.mapAsScalaMapConverter(emptyMap).asScala().toMap(Predef.conforms()),
        Option.empty(),
        fileStatusCache,
        Option.empty(),
        Option.empty());
    org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec();
    StructType schema = spec.partitionColumns();
    return JavaConverters.seqAsJavaListConverter(spec.partitions()).asJava().stream().map(partition -> {
        Map<String, String> values = Maps.newHashMap();
        JavaConverters.asJavaIterableConverter(schema).asJava().forEach(field -> {
            int fieldIndex = schema.fieldIndex(field.name());
            Object catalystValue = partition.values().get(fieldIndex, field.dataType());
            Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType());
            values.put(field.name(), String.valueOf(value));
        });
        return new SparkPartition(values, partition.path().toString(), format);
    }).collect(Collectors.toList());
}
Also used : FileStatusCache(org.apache.spark.sql.execution.datasources.FileStatusCache) WRITE_DISTRIBUTION_MODE_RANGE(org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE) Distributions(org.apache.spark.sql.connector.iceberg.distributions.Distributions) Arrays(java.util.Arrays) DataSourceV2Relation(org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation) TypeUtil(org.apache.iceberg.types.TypeUtil) Types(org.apache.iceberg.types.Types) MetadataTableUtils(org.apache.iceberg.MetadataTableUtils) UpdateSchema(org.apache.iceberg.UpdateSchema) PartitionSpecVisitor(org.apache.iceberg.transforms.PartitionSpecVisitor) ByteBuffer(java.nio.ByteBuffer) TableOperations(org.apache.iceberg.TableOperations) TableCatalog(org.apache.spark.sql.connector.catalog.TableCatalog) SortOrder(org.apache.spark.sql.connector.iceberg.expressions.SortOrder) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) StructType(org.apache.spark.sql.types.StructType) Some(scala.Some) Term(org.apache.iceberg.expressions.Term) IntegerType(org.apache.spark.sql.types.IntegerType) Seq(scala.collection.Seq) SortOrderVisitor(org.apache.iceberg.transforms.SortOrderVisitor) Set(java.util.Set) LongType(org.apache.spark.sql.types.LongType) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Schema(org.apache.iceberg.Schema) WRITE_DISTRIBUTION_MODE(org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) Type(org.apache.iceberg.types.Type) List(java.util.List) UpdateProperties(org.apache.iceberg.UpdateProperties) ExpressionVisitors(org.apache.iceberg.expressions.ExpressionVisitors) OrderedDistribution(org.apache.spark.sql.connector.iceberg.distributions.OrderedDistribution) Expressions(org.apache.spark.sql.connector.expressions.Expressions) DistributionMode(org.apache.iceberg.DistributionMode) PartitionSpec(org.apache.iceberg.PartitionSpec) JavaConverters(scala.collection.JavaConverters) TableProperties(org.apache.iceberg.TableProperties) Transform(org.apache.spark.sql.connector.expressions.Transform) ImmutableSet(org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet) Dataset(org.apache.spark.sql.Dataset) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) TableChange(org.apache.spark.sql.connector.catalog.TableChange) Pair(org.apache.iceberg.util.Pair) SortOrderUtil(org.apache.iceberg.util.SortOrderUtil) ParseException(org.apache.spark.sql.catalyst.parser.ParseException) BoundPredicate(org.apache.iceberg.expressions.BoundPredicate) InMemoryFileIndex(org.apache.spark.sql.execution.datasources.InMemoryFileIndex) Predef(scala.Predef) SparkPartition(org.apache.iceberg.spark.SparkTableUtil.SparkPartition) NoSuchTableException(org.apache.spark.sql.catalyst.analysis.NoSuchTableException) NullOrder(org.apache.iceberg.NullOrder) Namespace(org.apache.iceberg.catalog.Namespace) SparkSession(org.apache.spark.sql.SparkSession) CatalystTypeConverters(org.apache.spark.sql.catalyst.CatalystTypeConverters) TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Literal(org.apache.spark.sql.connector.expressions.Literal) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) MetadataTableType(org.apache.iceberg.MetadataTableType) Row(org.apache.spark.sql.Row) Option(scala.Option) Joiner(org.apache.iceberg.relocated.com.google.common.base.Joiner) Distribution(org.apache.spark.sql.connector.iceberg.distributions.Distribution) Expression(org.apache.spark.sql.connector.expressions.Expression) CatalogPlugin(org.apache.spark.sql.connector.catalog.CatalogPlugin) Preconditions(org.apache.iceberg.relocated.com.google.common.base.Preconditions) UnboundPredicate(org.apache.iceberg.expressions.UnboundPredicate) Identifier(org.apache.spark.sql.connector.catalog.Identifier) ParserInterface(org.apache.spark.sql.catalyst.parser.ParserInterface) WRITE_DISTRIBUTION_MODE_NONE(org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE) Collections(java.util.Collections) SparkTable(org.apache.iceberg.spark.source.SparkTable) CaseInsensitiveStringMap(org.apache.spark.sql.util.CaseInsensitiveStringMap) CatalogManager(org.apache.spark.sql.connector.catalog.CatalogManager) Table(org.apache.spark.sql.connector.catalog.Table)
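
A minimal usage sketch follows (not from the Iceberg source): it assumes an active local SparkSession and a Hive-style partitioned Parquet table at a hypothetical path, with imports as in the list above.

// Hedged sketch: list the partitions of a (hypothetical) partitioned Parquet table.
public static void listPartitionsExample() {
    SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();
    // Hypothetical table location; substitute a real partitioned table root.
    Path rootPath = new Path("/tmp/warehouse/db/sample_table");
    List<SparkPartition> partitions = Spark3Util.getPartitions(spark, rootPath, "parquet");
    // Each SparkPartition carries the partition values, location, and format.
    partitions.forEach(System.out::println);
}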

Example 2 with Identifier

Use of org.apache.spark.sql.connector.catalog.Identifier in project iceberg by apache.

From the class Spark3Util, the method catalogAndIdentifier:

/**
 * A modified version of Spark's LookupCatalog.CatalogAndIdentifier.unapply.
 * Attempts to find the catalog and identifier that a multipart identifier represents.
 *
 * @param spark Spark session to use for resolution
 * @param nameParts multipart identifier representing a table
 * @param defaultCatalog catalog to use if none is specified
 * @return the CatalogPlugin and Identifier for the table
 */
public static CatalogAndIdentifier catalogAndIdentifier(SparkSession spark, List<String> nameParts, CatalogPlugin defaultCatalog) {
    CatalogManager catalogManager = spark.sessionState().catalogManager();
    String[] currentNamespace;
    if (defaultCatalog.equals(catalogManager.currentCatalog())) {
        currentNamespace = catalogManager.currentNamespace();
    } else {
        currentNamespace = defaultCatalog.defaultNamespace();
    }
    Pair<CatalogPlugin, Identifier> catalogIdentifier = SparkUtil.catalogAndIdentifier(nameParts, catalogName -> {
        try {
            return catalogManager.catalog(catalogName);
        } catch (Exception e) {
            return null;
        }
    }, Identifier::of, defaultCatalog, currentNamespace);
    return new CatalogAndIdentifier(catalogIdentifier);
}
Also used : CatalogPlugin(org.apache.spark.sql.connector.catalog.CatalogPlugin) TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Identifier(org.apache.spark.sql.connector.catalog.Identifier) CatalogManager(org.apache.spark.sql.connector.catalog.CatalogManager) ParseException(org.apache.spark.sql.catalyst.parser.ParseException) NoSuchTableException(org.apache.spark.sql.catalyst.analysis.NoSuchTableException)
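
A hedged sketch of driving this resolver (the table name is illustrative, imports as in the list above):

// Resolve the multipart name "db.tbl" against the session's current catalog.
public static void resolveExample() {
    SparkSession spark = SparkSession.active();
    CatalogPlugin currentCatalog = spark.sessionState().catalogManager().currentCatalog();
    Spark3Util.CatalogAndIdentifier resolved =
        Spark3Util.catalogAndIdentifier(spark, Arrays.asList("db", "tbl"), currentCatalog);
    System.out.println(resolved.catalog().name() + " -> " + resolved.identifier());
}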

Example 3 with Identifier

Use of org.apache.spark.sql.connector.catalog.Identifier in project iceberg by apache.

From the class RollbackToSnapshotProcedure, the method call:

@Override
public InternalRow[] call(InternalRow args) {
    Identifier tableIdent = toIdentifier(args.getString(0), PARAMETERS[0].name());
    long snapshotId = args.getLong(1);
    return modifyIcebergTable(tableIdent, table -> {
        Snapshot previousSnapshot = table.currentSnapshot();
        table.manageSnapshots().rollbackTo(snapshotId).commit();
        InternalRow outputRow = newInternalRow(previousSnapshot.snapshotId(), snapshotId);
        return new InternalRow[] { outputRow };
    });
}
Also used : Snapshot(org.apache.iceberg.Snapshot) Identifier(org.apache.spark.sql.connector.catalog.Identifier) InternalRow(org.apache.spark.sql.catalyst.InternalRow)
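
This procedure is normally reached through Spark SQL's CALL syntax rather than invoked directly. A minimal sketch follows, with the catalog name, table name, and snapshot id all illustrative; the returned row carries the previous and the new current snapshot ids, matching the InternalRow built above.

// Hedged sketch: roll a (hypothetical) table back to an earlier snapshot.
public static void rollbackExample(SparkSession spark) {
    spark.sql("CALL my_catalog.system.rollback_to_snapshot('db.tbl', 5781947118336215154)").show();
}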

Example 4 with Identifier

Use of org.apache.spark.sql.connector.catalog.Identifier in project iceberg by apache.

From the class IcebergSource, the method getTable:

@Override
public Table getTable(StructType schema, Transform[] partitioning, Map<String, String> options) {
    Spark3Util.CatalogAndIdentifier catalogIdentifier = catalogAndIdentifier(new CaseInsensitiveStringMap(options));
    CatalogPlugin catalog = catalogIdentifier.catalog();
    Identifier ident = catalogIdentifier.identifier();
    try {
        if (catalog instanceof TableCatalog) {
            return ((TableCatalog) catalog).loadTable(ident);
        }
    } catch (NoSuchTableException e) {
        // throwing an Iceberg NoSuchTableException because the Spark one is a checked exception and can't be thrown from this interface
        throw new org.apache.iceberg.exceptions.NoSuchTableException(e, "Cannot find table for %s.", ident);
    }
    // throwing an Iceberg NoSuchTableException because the Spark one is a checked exception and can't be thrown from this interface
    throw new org.apache.iceberg.exceptions.NoSuchTableException("Cannot find table for %s.", ident);
}
Also used : CatalogPlugin(org.apache.spark.sql.connector.catalog.CatalogPlugin) PathIdentifier(org.apache.iceberg.spark.PathIdentifier) Identifier(org.apache.spark.sql.connector.catalog.Identifier) TableCatalog(org.apache.spark.sql.connector.catalog.TableCatalog) NoSuchTableException(org.apache.spark.sql.catalyst.analysis.NoSuchTableException) CaseInsensitiveStringMap(org.apache.spark.sql.util.CaseInsensitiveStringMap) Spark3Util(org.apache.iceberg.spark.Spark3Util)
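
In practice getTable is exercised indirectly through the DataFrameReader. A minimal sketch, assuming an active session and a hypothetical table name:

// Loading by identifier routes through IcebergSource.getTable and the resolved TableCatalog.
public static void loadTableExample(SparkSession spark) {
    Dataset<Row> df = spark.read().format("iceberg").load("db.tbl");
    df.show();
}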

Example 5 with Identifier

Use of org.apache.spark.sql.connector.catalog.Identifier in project iceberg by apache.

From the class IcebergSource, the method catalogAndIdentifier:

private Spark3Util.CatalogAndIdentifier catalogAndIdentifier(CaseInsensitiveStringMap options) {
    Preconditions.checkArgument(options.containsKey("path"), "Cannot open table: path is not set");
    SparkSession spark = SparkSession.active();
    setupDefaultSparkCatalog(spark);
    String path = options.get("path");
    Long snapshotId = propertyAsLong(options, SparkReadOptions.SNAPSHOT_ID);
    Long asOfTimestamp = propertyAsLong(options, SparkReadOptions.AS_OF_TIMESTAMP);
    Preconditions.checkArgument(asOfTimestamp == null || snapshotId == null, "Cannot specify both snapshot-id (%s) and as-of-timestamp (%s)", snapshotId, asOfTimestamp);
    String selector = null;
    if (snapshotId != null) {
        selector = SNAPSHOT_ID + snapshotId;
    }
    if (asOfTimestamp != null) {
        selector = AT_TIMESTAMP + asOfTimestamp;
    }
    CatalogManager catalogManager = spark.sessionState().catalogManager();
    if (path.contains("/")) {
        // the value contains a path; return the Iceberg default catalog and a PathIdentifier
        String newPath = (selector == null) ? path : path + "#" + selector;
        return new Spark3Util.CatalogAndIdentifier(catalogManager.catalog(DEFAULT_CATALOG_NAME), new PathIdentifier(newPath));
    }
    final Spark3Util.CatalogAndIdentifier catalogAndIdentifier = Spark3Util.catalogAndIdentifier("path or identifier", spark, path);
    Identifier ident = identifierWithSelector(catalogAndIdentifier.identifier(), selector);
    if (catalogAndIdentifier.catalog().name().equals("spark_catalog") && !(catalogAndIdentifier.catalog() instanceof SparkSessionCatalog)) {
        // the resolved catalog is the session catalog but does not support Iceberg; use the Iceberg default catalog instead
        return new Spark3Util.CatalogAndIdentifier(catalogManager.catalog(DEFAULT_CATALOG_NAME), ident);
    } else {
        return new Spark3Util.CatalogAndIdentifier(catalogAndIdentifier.catalog(), ident);
    }
}
Also used : SparkSessionCatalog(org.apache.iceberg.spark.SparkSessionCatalog) SparkSession(org.apache.spark.sql.SparkSession) PathIdentifier(org.apache.iceberg.spark.PathIdentifier) Identifier(org.apache.spark.sql.connector.catalog.Identifier) PathIdentifier(org.apache.iceberg.spark.PathIdentifier) CatalogManager(org.apache.spark.sql.connector.catalog.CatalogManager) Spark3Util(org.apache.iceberg.spark.Spark3Util)
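
A sketch of the path-based branch above, with the filesystem path and snapshot id illustrative. Because the load value contains a "/", the method returns the Iceberg default catalog with a PathIdentifier, and the snapshot-id read option becomes the selector appended to the path.

// Hedged sketch: read a path-based table at a specific snapshot.
public static void loadPathAtSnapshotExample(SparkSession spark) {
    Dataset<Row> df = spark.read()
        .format("iceberg")
        .option("snapshot-id", 10963874102873L)
        .load("/data/warehouse/db/sample_table");
    df.show();
}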

Aggregations

Identifier (org.apache.spark.sql.connector.catalog.Identifier): 34
TableCatalog (org.apache.spark.sql.connector.catalog.TableCatalog): 13
SparkTable (org.apache.iceberg.spark.source.SparkTable): 11
Map (java.util.Map): 10
DatasetIdentifier (io.openlineage.spark.agent.util.DatasetIdentifier): 7
Maps (org.apache.iceberg.relocated.com.google.common.collect.Maps): 7
SparkCatalog (org.apache.iceberg.spark.SparkCatalog): 7
Transform (org.apache.spark.sql.connector.expressions.Transform): 7
Test (org.junit.Test): 7
DeleteOrphanFiles (org.apache.iceberg.actions.DeleteOrphanFiles): 6
SparkSessionCatalog (org.apache.iceberg.spark.SparkSessionCatalog): 6
SparkSession (org.apache.spark.sql.SparkSession): 6
CatalogManager (org.apache.spark.sql.connector.catalog.CatalogManager): 6
File (java.io.File): 5
StreamSupport (java.util.stream.StreamSupport): 5
Path (org.apache.hadoop.fs.Path): 5
SparkSchemaUtil (org.apache.iceberg.spark.SparkSchemaUtil): 5
InternalRow (org.apache.spark.sql.catalyst.InternalRow): 5
After (org.junit.After): 5
Assert (org.junit.Assert): 5