
Example 6 with SparkPartition

Use of org.apache.iceberg.spark.SparkTableUtil.SparkPartition in the apache/iceberg project.

From the class Spark3Util, the method getPartitions:

/**
 * Uses Spark to list all partitions under the given table location.
 *
 * @param spark a Spark session
 * @param rootPath the root path of the table on the file system
 * @param format the format of the data files (for example, "parquet")
 * @param partitionFilter a map from partition column name to value, used to filter the listed partitions
 * @return all of the table's partitions that match the filter
 */
public static List<SparkPartition> getPartitions(SparkSession spark, Path rootPath, String format,
                                                 Map<String, String> partitionFilter) {
    FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark);

    // Build an in-memory file index over the table's root path
    InMemoryFileIndex fileIndex = new InMemoryFileIndex(
        spark,
        JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(rootPath)).asScala().toSeq(),
        scala.collection.immutable.Map$.MODULE$.<String, String>empty(),
        Option.empty(),
        fileStatusCache,
        Option.empty(),
        Option.empty());

    org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec();
    StructType schema = spec.partitionColumns();
    if (schema.isEmpty()) {
        // No partition columns were inferred: the location is unpartitioned
        return Lists.newArrayList();
    }

    // Convert the partition filter map into Catalyst filter expressions
    List<org.apache.spark.sql.catalyst.expressions.Expression> filterExpressions =
        SparkUtil.partitionMapToExpression(schema, partitionFilter);
    Seq<org.apache.spark.sql.catalyst.expressions.Expression> scalaPartitionFilters =
        JavaConverters.asScalaBufferConverter(filterExpressions).asScala().toIndexedSeq();

    // No data filters: only partition pruning is applied
    List<org.apache.spark.sql.catalyst.expressions.Expression> dataFilters = Lists.newArrayList();
    Seq<org.apache.spark.sql.catalyst.expressions.Expression> scalaDataFilters =
        JavaConverters.asScalaBufferConverter(dataFilters).asScala().toIndexedSeq();

    Seq<PartitionDirectory> filteredPartitions =
        fileIndex.listFiles(scalaPartitionFilters, scalaDataFilters).toIndexedSeq();

    return JavaConverters.seqAsJavaListConverter(filteredPartitions).asJava().stream()
        .map(partition -> {
            // Convert each partition's Catalyst values into their string form
            Map<String, String> values = Maps.newHashMap();
            JavaConverters.asJavaIterableConverter(schema).asJava().forEach(field -> {
                int fieldIndex = schema.fieldIndex(field.name());
                Object catalystValue = partition.values().get(fieldIndex, field.dataType());
                Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType());
                values.put(field.name(), String.valueOf(value));
            });

            // The partition URI is the parent directory of the partition's files
            FileStatus fileStatus = JavaConverters.seqAsJavaListConverter(partition.files()).asJava().get(0);
            return new SparkPartition(values, fileStatus.getPath().getParent().toString(), format);
        })
        .collect(Collectors.toList());
}
Also used : FileStatusCache(org.apache.spark.sql.execution.datasources.FileStatusCache) Arrays(java.util.Arrays) DataSourceV2Relation(org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation) TypeUtil(org.apache.iceberg.types.TypeUtil) Types(org.apache.iceberg.types.Types) MetadataTableUtils(org.apache.iceberg.MetadataTableUtils) UpdateSchema(org.apache.iceberg.UpdateSchema) FileStatus(org.apache.hadoop.fs.FileStatus) PartitionSpecVisitor(org.apache.iceberg.transforms.PartitionSpecVisitor) ByteBuffer(java.nio.ByteBuffer) TableOperations(org.apache.iceberg.TableOperations) Lists(org.apache.iceberg.relocated.com.google.common.collect.Lists) TableCatalog(org.apache.spark.sql.connector.catalog.TableCatalog) ByteBuffers(org.apache.iceberg.util.ByteBuffers) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) StructType(org.apache.spark.sql.types.StructType) Some(scala.Some) Term(org.apache.iceberg.expressions.Term) IntegerType(org.apache.spark.sql.types.IntegerType) SortOrderVisitor(org.apache.iceberg.transforms.SortOrderVisitor) Set(java.util.Set) LongType(org.apache.spark.sql.types.LongType) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Schema(org.apache.iceberg.Schema) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) Type(org.apache.iceberg.types.Type) List(java.util.List) UpdateProperties(org.apache.iceberg.UpdateProperties) ExpressionVisitors(org.apache.iceberg.expressions.ExpressionVisitors) Expressions(org.apache.spark.sql.connector.expressions.Expressions) PartitionSpec(org.apache.iceberg.PartitionSpec) JavaConverters(scala.collection.JavaConverters) TableProperties(org.apache.iceberg.TableProperties) Transform(org.apache.spark.sql.connector.expressions.Transform) Seq(scala.collection.immutable.Seq) ImmutableSet(org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet) Dataset(org.apache.spark.sql.Dataset) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) PartitionDirectory(org.apache.spark.sql.execution.datasources.PartitionDirectory) TableChange(org.apache.spark.sql.connector.catalog.TableChange) Pair(org.apache.iceberg.util.Pair) BaseEncoding(org.apache.iceberg.relocated.com.google.common.io.BaseEncoding) ParseException(org.apache.spark.sql.catalyst.parser.ParseException) BoundPredicate(org.apache.iceberg.expressions.BoundPredicate) InMemoryFileIndex(org.apache.spark.sql.execution.datasources.InMemoryFileIndex) SparkPartition(org.apache.iceberg.spark.SparkTableUtil.SparkPartition) NoSuchTableException(org.apache.spark.sql.catalyst.analysis.NoSuchTableException) NullOrder(org.apache.iceberg.NullOrder) Namespace(org.apache.iceberg.catalog.Namespace) SparkSession(org.apache.spark.sql.SparkSession) CatalystTypeConverters(org.apache.spark.sql.catalyst.CatalystTypeConverters) NamedReference(org.apache.spark.sql.connector.expressions.NamedReference) TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Literal(org.apache.spark.sql.connector.expressions.Literal) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) MetadataTableType(org.apache.iceberg.MetadataTableType) Row(org.apache.spark.sql.Row) Option(scala.Option) Joiner(org.apache.iceberg.relocated.com.google.common.base.Joiner) Expression(org.apache.spark.sql.connector.expressions.Expression) CatalogPlugin(org.apache.spark.sql.connector.catalog.CatalogPlugin) Preconditions(org.apache.iceberg.relocated.com.google.common.base.Preconditions) UnboundPredicate(org.apache.iceberg.expressions.UnboundPredicate) 
Identifier(org.apache.spark.sql.connector.catalog.Identifier) ParserInterface(org.apache.spark.sql.catalyst.parser.ParserInterface) SparkTable(org.apache.iceberg.spark.source.SparkTable) CaseInsensitiveStringMap(org.apache.spark.sql.util.CaseInsensitiveStringMap) CatalogManager(org.apache.spark.sql.connector.catalog.CatalogManager) Table(org.apache.spark.sql.connector.catalog.Table)
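
The following is a minimal usage sketch, not part of the indexed sources: it calls Spark3Util.getPartitions against a hypothetical Parquet table location and prints each discovered partition. The path and the "dt" partition column and filter value are assumptions for illustration; getUri() and getValues() are the accessors defined on SparkTableUtil.SparkPartition.

import java.util.List;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.spark.SparkTableUtil.SparkPartition;
import org.apache.spark.sql.SparkSession;

public class ListPartitionsExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("list-partitions")
            .master("local[*]")
            .getOrCreate();

        // Hypothetical table root and partition filter ("dt" is an assumed partition column)
        Path rootPath = new Path("/data/warehouse/db/tbl");
        Map<String, String> filter = ImmutableMap.of("dt", "2021-01-01");

        List<SparkPartition> partitions = Spark3Util.getPartitions(spark, rootPath, "parquet", filter);
        for (SparkPartition partition : partitions) {
            // Each SparkPartition carries the partition values and the directory URI
            System.out.printf("uri=%s values=%s%n", partition.getUri(), partition.getValues());
        }

        spark.stop();
    }
}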

Example 7 with SparkPartition

Use of org.apache.iceberg.spark.SparkTableUtil.SparkPartition in the apache/iceberg project.

From the class AddFilesProcedure, the method importFileTable:

private void importFileTable(Table table, Path tableLocation, String format,
                             Map<String, String> partitionFilter, boolean checkDuplicateFiles) {
    // List partitions using Spark's in-memory file index (Spark3Util.getPartitions above)
    List<SparkPartition> partitions = Spark3Util.getPartitions(spark(), tableLocation, format, partitionFilter);

    if (table.spec().isUnpartitioned()) {
        Preconditions.checkArgument(partitions.isEmpty(), "Cannot add partitioned files to an unpartitioned table");
        Preconditions.checkArgument(partitionFilter.isEmpty(),
            "Cannot use a partition filter when importing to an unpartitioned table");

        // Build a single partition spanning the entire source location
        SparkPartition partition = new SparkPartition(Collections.emptyMap(), tableLocation.toString(), format);
        importPartitions(table, ImmutableList.of(partition), checkDuplicateFiles);
    } else {
        Preconditions.checkArgument(!partitions.isEmpty(),
            "Cannot find any matching partitions in table %s", table.name());
        importPartitions(table, partitions, checkDuplicateFiles);
    }
}
Also used : SparkPartition(org.apache.iceberg.spark.SparkTableUtil.SparkPartition)
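
Since importFileTable is private, callers reach it through Iceberg's add_files stored procedure. Below is a minimal sketch, assuming a Spark session with an Iceberg catalog named my_catalog, a target table db.tbl, and Parquet data under a hypothetical path; the partition_filter argument narrows the import to one partition, mirroring the partitionFilter parameter above.

import org.apache.spark.sql.SparkSession;

public class AddFilesExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("add-files")
            .master("local[*]")
            .getOrCreate();

        // Import files from a path-based Parquet table into an Iceberg table.
        // "my_catalog", "db.tbl", the source path, and the "dt" value are assumptions.
        spark.sql(
            "CALL my_catalog.system.add_files("
                + "table => 'db.tbl', "
                + "source_table => '`parquet`.`/data/warehouse/db/tbl`', "
                + "partition_filter => map('dt', '2021-01-01'))");

        spark.stop();
    }
}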

Aggregations

SparkPartition (org.apache.iceberg.spark.SparkTableUtil.SparkPartition): 7
Row (org.apache.spark.sql.Row): 4
Test (org.junit.Test): 3
File (java.io.File): 2
ByteBuffer (java.nio.ByteBuffer): 2
Arrays (java.util.Arrays): 2
List (java.util.List): 2
Map (java.util.Map): 2
Objects (java.util.Objects): 2
Set (java.util.Set): 2
Collectors (java.util.stream.Collectors): 2
Path (org.apache.hadoop.fs.Path): 2
MetadataTableType (org.apache.iceberg.MetadataTableType): 2
MetadataTableUtils (org.apache.iceberg.MetadataTableUtils): 2
NullOrder (org.apache.iceberg.NullOrder): 2
PartitionSpec (org.apache.iceberg.PartitionSpec): 2
Schema (org.apache.iceberg.Schema): 2
Table (org.apache.iceberg.Table): 2
TableOperations (org.apache.iceberg.TableOperations): 2
TableProperties (org.apache.iceberg.TableProperties): 2