Use of org.apache.iceberg.spark.SparkTableUtil.SparkPartition in project iceberg by apache.
The class TestSparkTableUtilWithInMemoryCatalog, method testImportPartitionsWithSnapshotInheritance.
@Test
public void testImportPartitionsWithSnapshotInheritance() throws IOException {
  Table table = TABLES.create(SCHEMA, SPEC, tableLocation);
  // enable snapshot ID inheritance so imported manifests can be appended without being rewritten
  table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit();

  List<SimpleRecord> records = Lists.newArrayList(
      new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));

  File parquetTableDir = temp.newFolder("parquet_table");
  String parquetTableLocation = parquetTableDir.toURI().toString();
  try {
    // write a partitioned Parquet source table
    Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
    inputDF.select("id", "data").write().format("parquet").mode("append")
        .option("path", parquetTableLocation).partitionBy("data").saveAsTable("parquet_table");

    // import only the partitions that match the filter into the Iceberg table
    File stagingDir = temp.newFolder("staging-dir");
    List<SparkPartition> partitions =
        SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'");
    SparkTableUtil.importSparkPartitions(spark, partitions, table, table.spec(), stagingDir.toString());

    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
    List<SimpleRecord> actualRecords = spark.read().format("iceberg").load(tableLocation)
        .orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
    Assert.assertEquals("Result rows should match", expectedRecords, actualRecords);
  } finally {
    spark.sql("DROP TABLE parquet_table");
  }
}
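The SparkPartition objects returned by getPartitionsByFilter can also be built by hand when the source layout is already known, using the (values, uri, format) constructor that the serialization tests below exercise. A minimal sketch; the class name, method name, and partition location are hypothetical, and only the importSparkPartitions call shown in the tests above is taken from the source.

import java.util.Collections;
import java.util.List;
import org.apache.iceberg.Table;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.iceberg.spark.SparkTableUtil.SparkPartition;
import org.apache.spark.sql.SparkSession;

public class ImportSinglePartition {
  public static void importPartition(SparkSession spark, Table table, String stagingDir) {
    // describe one Hive-style partition of the source table: column values, location, and file format
    SparkPartition partition = new SparkPartition(
        Collections.singletonMap("data", "a"),   // partition column -> value
        "file:/tmp/parquet_table/data=a",        // hypothetical partition directory
        "parquet");

    List<SparkPartition> partitions = Collections.singletonList(partition);

    // append the partition's data files to the Iceberg table using its current spec
    SparkTableUtil.importSparkPartitions(spark, partitions, table, table.spec(), stagingDir);
  }
}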
Use of org.apache.iceberg.spark.SparkTableUtil.SparkPartition in project iceberg by apache.
The class Spark3Util, method getPartitions.
/**
 * Use Spark to list all partitions in the table.
 *
 * @param spark a Spark session
 * @param rootPath the root path of the table
 * @param format format of the data files
 * @return all of the table's partitions
 */
public static List<SparkPartition> getPartitions(SparkSession spark, Path rootPath, String format) {
  FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark);
  Map<String, String> emptyMap = Collections.emptyMap();

  // use Spark's InMemoryFileIndex to discover the partition directories under the root path
  InMemoryFileIndex fileIndex = new InMemoryFileIndex(
      spark,
      JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(rootPath)).asScala().toSeq(),
      JavaConverters.mapAsScalaMapConverter(emptyMap).asScala().toMap(Predef.conforms()),
      Option.empty(),
      fileStatusCache,
      Option.empty(),
      Option.empty());

  org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec();
  StructType schema = spec.partitionColumns();

  return JavaConverters.seqAsJavaListConverter(spec.partitions()).asJava().stream()
      .map(partition -> {
        // convert each partition's Catalyst values to strings keyed by partition column name
        Map<String, String> values = Maps.newHashMap();
        JavaConverters.asJavaIterableConverter(schema).asJava().forEach(field -> {
          int fieldIndex = schema.fieldIndex(field.name());
          Object catalystValue = partition.values().get(fieldIndex, field.dataType());
          Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType());
          values.put(field.name(), String.valueOf(value));
        });
        return new SparkPartition(values, partition.path().toString(), format);
      })
      .collect(Collectors.toList());
}
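A caller might combine this listing with the import API shown in the tests above to migrate a path-based table. A minimal sketch; the class ImportParquetPartitions, the method importAll, and the choice of the "parquet" format string are illustrative, and only the getPartitions and importSparkPartitions signatures shown elsewhere in this page are taken from the source.

import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.iceberg.Table;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.iceberg.spark.SparkTableUtil.SparkPartition;
import org.apache.spark.sql.SparkSession;

public class ImportParquetPartitions {
  public static void importAll(SparkSession spark, Table targetTable,
                               String sourceRootLocation, String stagingLocation) {
    // list the partition directories of the path-based source table
    List<SparkPartition> partitions =
        Spark3Util.getPartitions(spark, new Path(sourceRootLocation), "parquet");

    // append the discovered partitions to the Iceberg table, using its current spec
    SparkTableUtil.importSparkPartitions(
        spark, partitions, targetTable, targetTable.spec(), stagingLocation);
  }
}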
Use of org.apache.iceberg.spark.SparkTableUtil.SparkPartition in project iceberg by apache.
The class TestSparkTableUtil, method testSparkPartitionJavaSerialization.
@Test
public void testSparkPartitionJavaSerialization() throws IOException, ClassNotFoundException {
  Map<String, String> values = ImmutableMap.of("id", "2");
  String uri = "s3://bucket/table/data/id=2";
  String format = "parquet";

  SparkPartition sparkPartition = new SparkPartition(values, uri, format);
  SparkPartition deserialized = TestHelpers.roundTripSerialize(sparkPartition);
  Assertions.assertThat(sparkPartition).isEqualTo(deserialized);
}
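TestHelpers.roundTripSerialize is the project's own test utility. A minimal sketch of an equivalent helper, assuming plain java.io serialization; the class name is illustrative and the actual implementation in the repository may differ.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

public class JavaSerializationRoundTrip {
  @SuppressWarnings("unchecked")
  public static <T> T roundTripSerialize(T obj) throws IOException, ClassNotFoundException {
    // serialize the object to an in-memory byte array
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
      out.writeObject(obj);
    }
    // deserialize it back; the test then checks equality against the original
    try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
      return (T) in.readObject();
    }
  }
}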
Use of org.apache.iceberg.spark.SparkTableUtil.SparkPartition in project iceberg by apache.
The class TestSparkTableUtil, method testSparkPartitionOKryoSerialization.
@Test
public void testSparkPartitionOKryoSerialization() throws IOException {
  Map<String, String> values = ImmutableMap.of("id", "2");
  String uri = "s3://bucket/table/data/id=2";
  String format = "parquet";

  SparkPartition sparkPartition = new SparkPartition(values, uri, format);
  SparkPartition deserialized = KryoHelpers.roundTripSerialize(sparkPartition);
  Assertions.assertThat(sparkPartition).isEqualTo(deserialized);
}
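KryoHelpers.roundTripSerialize is likewise a test utility. A minimal sketch of an equivalent round trip built on Spark's KryoSerializer; the class name is illustrative and the helper in the repository may be implemented differently.

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import org.apache.spark.SparkConf;
import org.apache.spark.serializer.KryoSerializer;

public class KryoRoundTrip {
  @SuppressWarnings("unchecked")
  public static <T> T roundTripSerialize(T obj) {
    // build a Kryo instance configured the way Spark would configure it on executors
    Kryo kryo = new KryoSerializer(new SparkConf()).newKryo();

    // write the object (with its class) to an in-memory buffer
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (Output out = new Output(bytes)) {
      kryo.writeClassAndObject(out, obj);
    }

    // read it back so the caller can compare it with the original
    try (Input in = new Input(new ByteArrayInputStream(bytes.toByteArray()))) {
      return (T) kryo.readClassAndObject(in);
    }
  }
}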
Use of org.apache.iceberg.spark.SparkTableUtil.SparkPartition in project iceberg by apache.
The class TestSparkTableUtilWithInMemoryCatalog, method testImportPartitions.
@Test
public void testImportPartitions() throws IOException {
  Table table = TABLES.create(SCHEMA, SPEC, tableLocation);

  List<SimpleRecord> records = Lists.newArrayList(
      new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));

  File parquetTableDir = temp.newFolder("parquet_table");
  String parquetTableLocation = parquetTableDir.toURI().toString();
  try {
    // write a partitioned Parquet source table
    Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
    inputDF.select("id", "data").write().format("parquet").mode("append")
        .option("path", parquetTableLocation).partitionBy("data").saveAsTable("parquet_table");

    // import only the partitions that match the filter into the Iceberg table
    File stagingDir = temp.newFolder("staging-dir");
    List<SparkPartition> partitions =
        SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'");
    SparkTableUtil.importSparkPartitions(spark, partitions, table, table.spec(), stagingDir.toString());

    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
    List<SimpleRecord> actualRecords = spark.read().format("iceberg").load(tableLocation)
        .orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
    Assert.assertEquals("Result rows should match", expectedRecords, actualRecords);
  } finally {
    spark.sql("DROP TABLE parquet_table");
  }
}
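When every partition of the source table should be imported rather than a filtered subset, SparkTableUtil also offers a whole-table import. A minimal sketch, assuming an importSparkTable(SparkSession, TableIdentifier, Table, String) overload exists in the Iceberg version in use; the class and method names are illustrative, so verify the signature against the actual SparkTableUtil before relying on it.

import java.io.File;
import org.apache.iceberg.Table;
import org.apache.iceberg.spark.SparkTableUtil;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.TableIdentifier;

public class ImportWholeTable {
  public static void importParquetTable(SparkSession spark, Table targetTable, File stagingDir) {
    // identifier of the session-catalog source table created with saveAsTable("parquet_table")
    TableIdentifier sourceIdent = new TableIdentifier("parquet_table");

    // rewrite the source table's metadata into Iceberg manifests and append them to the target table
    SparkTableUtil.importSparkTable(spark, sourceIdent, targetTable, stagingDir.toString());
  }
}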