
Example 1 with SparkPartition

Use of org.apache.iceberg.spark.SparkTableUtil.SparkPartition in the apache/iceberg project.

From class TestSparkTableUtilWithInMemoryCatalog, method testImportPartitionsWithSnapshotInheritance.

@Test
public void testImportPartitionsWithSnapshotInheritance() throws IOException {
    Table table = TABLES.create(SCHEMA, SPEC, tableLocation);
    table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit();
    List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    File parquetTableDir = temp.newFolder("parquet_table");
    String parquetTableLocation = parquetTableDir.toURI().toString();
    try {
        Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
        inputDF.select("id", "data")
            .write()
            .format("parquet")
            .mode("append")
            .option("path", parquetTableLocation)
            .partitionBy("data")
            .saveAsTable("parquet_table");
        File stagingDir = temp.newFolder("staging-dir");
        List<SparkPartition> partitions = SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'");
        SparkTableUtil.importSparkPartitions(spark, partitions, table, table.spec(), stagingDir.toString());
        List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
        List<SimpleRecord> actualRecords = spark.read().format("iceberg")
            .load(tableLocation)
            .orderBy("id")
            .as(Encoders.bean(SimpleRecord.class))
            .collectAsList();
        Assert.assertEquals("Result rows should match", expectedRecords, actualRecords);
    } finally {
        spark.sql("DROP TABLE parquet_table");
    }
}
Also used : SparkPartition(org.apache.iceberg.spark.SparkTableUtil.SparkPartition) Table(org.apache.iceberg.Table) Row(org.apache.spark.sql.Row) File(java.io.File) Test(org.junit.Test)
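
With SNAPSHOT_ID_INHERITANCE_ENABLED set to "true", the import can commit the staged manifests directly instead of rewriting them to assign explicit snapshot IDs. For comparison, here is a minimal sketch, assuming the same spark session, table, and stagingDir as above, of importing the whole source table at once with SparkTableUtil.importSparkTable, which takes Spark's catalyst TableIdentifier rather than a pre-filtered partition list:

import org.apache.spark.sql.catalyst.TableIdentifier;

// Import every partition of the source table in one call; the staging
// directory holds the manifests written during the import.
TableIdentifier sourceIdent = new TableIdentifier("parquet_table");
SparkTableUtil.importSparkTable(spark, sourceIdent, table, stagingDir.toString());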

Example 2 with SparkPartition

Use of org.apache.iceberg.spark.SparkTableUtil.SparkPartition in the apache/iceberg project.

From class Spark3Util, method getPartitions.

/**
 * Uses Spark to list all partitions in the table.
 *
 * @param spark a Spark session
 * @param rootPath the table's root location
 * @param format the format of the table's data files
 * @return all of the table's partitions
 */
public static List<SparkPartition> getPartitions(SparkSession spark, Path rootPath, String format) {
    // Use Spark's file index to discover partition directories under the root path
    FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark);
    Map<String, String> emptyMap = Collections.emptyMap();
    InMemoryFileIndex fileIndex = new InMemoryFileIndex(
        spark,
        JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(rootPath)).asScala().toSeq(),
        JavaConverters.mapAsScalaMapConverter(emptyMap).asScala().toMap(Predef.conforms()),
        Option.empty(),
        fileStatusCache,
        Option.empty(),
        Option.empty());
    // Infer the partition columns and values from the directory layout
    org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec();
    StructType schema = spec.partitionColumns();
    return JavaConverters.seqAsJavaListConverter(spec.partitions()).asJava().stream().map(partition -> {
        Map<String, String> values = Maps.newHashMap();
        JavaConverters.asJavaIterableConverter(schema).asJava().forEach(field -> {
            int fieldIndex = schema.fieldIndex(field.name());
            Object catalystValue = partition.values().get(fieldIndex, field.dataType());
            Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType());
            values.put(field.name(), String.valueOf(value));
        });
        return new SparkPartition(values, partition.path().toString(), format);
    }).collect(Collectors.toList());
}
Also used : FileStatusCache(org.apache.spark.sql.execution.datasources.FileStatusCache) WRITE_DISTRIBUTION_MODE_RANGE(org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_RANGE) Distributions(org.apache.spark.sql.connector.iceberg.distributions.Distributions) Arrays(java.util.Arrays) DataSourceV2Relation(org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation) TypeUtil(org.apache.iceberg.types.TypeUtil) Types(org.apache.iceberg.types.Types) MetadataTableUtils(org.apache.iceberg.MetadataTableUtils) UpdateSchema(org.apache.iceberg.UpdateSchema) PartitionSpecVisitor(org.apache.iceberg.transforms.PartitionSpecVisitor) ByteBuffer(java.nio.ByteBuffer) TableOperations(org.apache.iceberg.TableOperations) TableCatalog(org.apache.spark.sql.connector.catalog.TableCatalog) SortOrder(org.apache.spark.sql.connector.iceberg.expressions.SortOrder) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) StructType(org.apache.spark.sql.types.StructType) Some(scala.Some) Term(org.apache.iceberg.expressions.Term) IntegerType(org.apache.spark.sql.types.IntegerType) Seq(scala.collection.Seq) SortOrderVisitor(org.apache.iceberg.transforms.SortOrderVisitor) Set(java.util.Set) LongType(org.apache.spark.sql.types.LongType) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Schema(org.apache.iceberg.Schema) WRITE_DISTRIBUTION_MODE(org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) Type(org.apache.iceberg.types.Type) List(java.util.List) UpdateProperties(org.apache.iceberg.UpdateProperties) ExpressionVisitors(org.apache.iceberg.expressions.ExpressionVisitors) OrderedDistribution(org.apache.spark.sql.connector.iceberg.distributions.OrderedDistribution) Expressions(org.apache.spark.sql.connector.expressions.Expressions) DistributionMode(org.apache.iceberg.DistributionMode) PartitionSpec(org.apache.iceberg.PartitionSpec) JavaConverters(scala.collection.JavaConverters) TableProperties(org.apache.iceberg.TableProperties) Transform(org.apache.spark.sql.connector.expressions.Transform) ImmutableSet(org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet) Dataset(org.apache.spark.sql.Dataset) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) TableChange(org.apache.spark.sql.connector.catalog.TableChange) Pair(org.apache.iceberg.util.Pair) SortOrderUtil(org.apache.iceberg.util.SortOrderUtil) ParseException(org.apache.spark.sql.catalyst.parser.ParseException) BoundPredicate(org.apache.iceberg.expressions.BoundPredicate) InMemoryFileIndex(org.apache.spark.sql.execution.datasources.InMemoryFileIndex) Predef(scala.Predef) SparkPartition(org.apache.iceberg.spark.SparkTableUtil.SparkPartition) NoSuchTableException(org.apache.spark.sql.catalyst.analysis.NoSuchTableException) NullOrder(org.apache.iceberg.NullOrder) Namespace(org.apache.iceberg.catalog.Namespace) SparkSession(org.apache.spark.sql.SparkSession) CatalystTypeConverters(org.apache.spark.sql.catalyst.CatalystTypeConverters) TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Literal(org.apache.spark.sql.connector.expressions.Literal) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) MetadataTableType(org.apache.iceberg.MetadataTableType) Row(org.apache.spark.sql.Row) Option(scala.Option) Joiner(org.apache.iceberg.relocated.com.google.common.base.Joiner) Distribution(org.apache.spark.sql.connector.iceberg.distributions.Distribution) 
Expression(org.apache.spark.sql.connector.expressions.Expression) CatalogPlugin(org.apache.spark.sql.connector.catalog.CatalogPlugin) Preconditions(org.apache.iceberg.relocated.com.google.common.base.Preconditions) UnboundPredicate(org.apache.iceberg.expressions.UnboundPredicate) Identifier(org.apache.spark.sql.connector.catalog.Identifier) ParserInterface(org.apache.spark.sql.catalyst.parser.ParserInterface) WRITE_DISTRIBUTION_MODE_NONE(org.apache.iceberg.TableProperties.WRITE_DISTRIBUTION_MODE_NONE) Collections(java.util.Collections) SparkTable(org.apache.iceberg.spark.source.SparkTable) CaseInsensitiveStringMap(org.apache.spark.sql.util.CaseInsensitiveStringMap) CatalogManager(org.apache.spark.sql.connector.catalog.CatalogManager) Table(org.apache.spark.sql.connector.catalog.Table)
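
A short usage sketch for getPartitions, assuming a SparkSession named spark and a Parquet table rooted at a hypothetical local path; the accessors used here (getValues, getUri) are SparkPartition's getters for the partition values and location:

import org.apache.hadoop.fs.Path;
import java.util.List;

// List the Hive-style partitions discovered under the table root.
Path rootPath = new Path("file:/tmp/parquet_table");
List<SparkPartition> partitions = Spark3Util.getPartitions(spark, rootPath, "parquet");
for (SparkPartition partition : partitions) {
    // e.g. {data=a} -> file:/tmp/parquet_table/data=a
    System.out.println(partition.getValues() + " -> " + partition.getUri());
}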

Example 3 with SparkPartition

Use of org.apache.iceberg.spark.SparkTableUtil.SparkPartition in the apache/iceberg project.

From class TestSparkTableUtil, method testSparkPartitionJavaSerialization.

@Test
public void testSparkPartitionJavaSerialization() throws IOException, ClassNotFoundException {
    Map<String, String> values = ImmutableMap.of("id", "2");
    String uri = "s3://bucket/table/data/id=2";
    String format = "parquet";
    SparkPartition sparkPartition = new SparkPartition(values, uri, format);
    SparkPartition deserialized = TestHelpers.roundTripSerialize(sparkPartition);
    Assertions.assertThat(sparkPartition).isEqualTo(deserialized);
}
Also used : SparkPartition(org.apache.iceberg.spark.SparkTableUtil.SparkPartition) Test(org.junit.Test)
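
TestHelpers.roundTripSerialize is an Iceberg test utility; a minimal equivalent built on plain java.io serialization might look like this (an assumed sketch, not the actual helper):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

@SuppressWarnings("unchecked")
static <T> T roundTripSerialize(T obj) throws IOException, ClassNotFoundException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
        // requires obj's class (here SparkPartition) to implement Serializable
        out.writeObject(obj);
    }
    try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
        return (T) in.readObject();
    }
}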

Example 4 with SparkPartition

Use of org.apache.iceberg.spark.SparkTableUtil.SparkPartition in the apache/iceberg project.

From class TestSparkTableUtil, method testSparkPartitionOKryoSerialization.

@Test
public void testSparkPartitionOKryoSerialization() throws IOException {
    Map<String, String> values = ImmutableMap.of("id", "2");
    String uri = "s3://bucket/table/data/id=2";
    String format = "parquet";
    SparkPartition sparkPartition = new SparkPartition(values, uri, format);
    SparkPartition deserialized = KryoHelpers.roundTripSerialize(sparkPartition);
    Assertions.assertThat(sparkPartition).isEqualTo(deserialized);
}
Also used : SparkPartition(org.apache.iceberg.spark.SparkTableUtil.SparkPartition) Test(org.junit.Test)
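
Kryo serialization is tested separately from Java serialization because Spark can be configured to ship task data with either serializer. A rough equivalent of KryoHelpers.roundTripSerialize built on Spark's KryoSerializer could look like this (an assumed sketch; the real helper may differ):

import org.apache.spark.SparkConf;
import org.apache.spark.serializer.KryoSerializer;
import org.apache.spark.serializer.SerializerInstance;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;

@SuppressWarnings("unchecked")
static <T> T kryoRoundTrip(T obj) {
    // Round-trip through the same Kryo serializer Spark would use at runtime.
    SerializerInstance kryo = new KryoSerializer(new SparkConf()).newInstance();
    ClassTag<T> tag = (ClassTag<T>) ClassTag$.MODULE$.apply(obj.getClass());
    return kryo.deserialize(kryo.serialize(obj, tag), tag);
}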

Example 5 with SparkPartition

Use of org.apache.iceberg.spark.SparkTableUtil.SparkPartition in the apache/iceberg project.

From class TestSparkTableUtilWithInMemoryCatalog, method testImportPartitions.

@Test
public void testImportPartitions() throws IOException {
    Table table = TABLES.create(SCHEMA, SPEC, tableLocation);
    List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    File parquetTableDir = temp.newFolder("parquet_table");
    String parquetTableLocation = parquetTableDir.toURI().toString();
    try {
        Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
        inputDF.select("id", "data")
            .write()
            .format("parquet")
            .mode("append")
            .option("path", parquetTableLocation)
            .partitionBy("data")
            .saveAsTable("parquet_table");
        File stagingDir = temp.newFolder("staging-dir");
        List<SparkPartition> partitions = SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'");
        SparkTableUtil.importSparkPartitions(spark, partitions, table, table.spec(), stagingDir.toString());
        List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
        List<SimpleRecord> actualRecords = spark.read().format("iceberg")
            .load(tableLocation)
            .orderBy("id")
            .as(Encoders.bean(SimpleRecord.class))
            .collectAsList();
        Assert.assertEquals("Result rows should match", expectedRecords, actualRecords);
    } finally {
        spark.sql("DROP TABLE parquet_table");
    }
}
Also used : SparkPartition(org.apache.iceberg.spark.SparkTableUtil.SparkPartition) Table(org.apache.iceberg.Table) Row(org.apache.spark.sql.Row) File(java.io.File) Test(org.junit.Test)
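
After an import, the registered data files can be checked through Iceberg's metadata tables. A hedged follow-up, assuming the same spark session and tableLocation as above, reads the "files" metadata table of the path-based table:

// List the data files the import registered with the Iceberg table.
spark.read().format("iceberg")
    .load(tableLocation + "#files")
    .select("file_path", "record_count")
    .show(false);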

Aggregations

SparkPartition (org.apache.iceberg.spark.SparkTableUtil.SparkPartition): 7 usages
Row (org.apache.spark.sql.Row): 4 usages
Test (org.junit.Test): 3 usages
File (java.io.File): 2 usages
ByteBuffer (java.nio.ByteBuffer): 2 usages
Arrays (java.util.Arrays): 2 usages
List (java.util.List): 2 usages
Map (java.util.Map): 2 usages
Objects (java.util.Objects): 2 usages
Set (java.util.Set): 2 usages
Collectors (java.util.stream.Collectors): 2 usages
Path (org.apache.hadoop.fs.Path): 2 usages
MetadataTableType (org.apache.iceberg.MetadataTableType): 2 usages
MetadataTableUtils (org.apache.iceberg.MetadataTableUtils): 2 usages
NullOrder (org.apache.iceberg.NullOrder): 2 usages
PartitionSpec (org.apache.iceberg.PartitionSpec): 2 usages
Schema (org.apache.iceberg.Schema): 2 usages
Table (org.apache.iceberg.Table): 2 usages
TableOperations (org.apache.iceberg.TableOperations): 2 usages
TableProperties (org.apache.iceberg.TableProperties): 2 usages