Use of org.apache.spark.sql.types.StructType$ in project iceberg by Apache.
The class TestSparkSchema, method testSparkReadSchemaIsHonored.
@Test
public void testSparkReadSchemaIsHonored() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();
  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);

  StructType sparkReadSchema = new StructType(new StructField[] {
      new StructField("id", DataTypes.IntegerType, true, Metadata.empty())
  });

  Dataset<Row> resultDf = spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation);

  Row[] results = (Row[]) resultDf.collect();
  Assert.assertEquals("Result size matches", 1, results.length);
  Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
  Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
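The same single-column read schema can also be built with Spark's DataTypes factory methods instead of the StructField array constructor; the following is a minimal sketch, assuming the same Spark SQL types imports as the test above:

// A sketch of the equivalent read schema built via the DataTypes factory methods.
StructType alternativeReadSchema = DataTypes.createStructType(new StructField[] {
    DataTypes.createStructField("id", DataTypes.IntegerType, true)
});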
Use of org.apache.spark.sql.types.StructType$ in project iceberg by Apache.
The class TestSparkSchema, method testSparkReadSchemaCombinedWithProjection.
@Test
public void testSparkReadSchemaCombinedWithProjection() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();
  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);

  StructType sparkReadSchema = new StructType(new StructField[] {
      new StructField("id", DataTypes.IntegerType, true, Metadata.empty()),
      new StructField("data", DataTypes.StringType, true, Metadata.empty())
  });

  Dataset<Row> resultDf = spark.read()
      .schema(sparkReadSchema)
      .format("iceberg")
      .load(tableLocation)
      .select("id");

  Row[] results = (Row[]) resultDf.collect();
  Assert.assertEquals("Result size matches", 1, results.length);
  Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
  Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
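The two-column read schema above can also be written with StructType's fluent add methods; a minimal sketch, again assuming the Spark SQL types imports used by the test:

// Equivalent two-column read schema; each add call returns a new StructType with the field appended.
StructType fluentReadSchema = new StructType()
    .add("id", DataTypes.IntegerType, true)
    .add("data", DataTypes.StringType, true);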
Use of org.apache.spark.sql.types.StructType$ in project iceberg by Apache.
The class TestSparkSchema, method testFailSparkReadSchemaCombinedWithProjectionWhenSchemaDoesNotContainProjection.
@Test
public void testFailSparkReadSchemaCombinedWithProjectionWhenSchemaDoesNotContainProjection() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();
  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);

  StructType sparkReadSchema = new StructType(new StructField[] {
      new StructField("data", DataTypes.StringType, true, Metadata.empty())
  });

  AssertHelpers.assertThrows("Spark should not allow a projection that is not included in the read schema",
      org.apache.spark.sql.AnalysisException.class,
      "cannot resolve '`id`' given input columns: [data]",
      () -> spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation).select("id"));
}
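Without the Iceberg AssertHelpers utility, the same expectation can be expressed with a plain try/catch; a minimal sketch (the exact AnalysisException message may vary across Spark versions):

// Plain-JUnit equivalent of the AssertHelpers.assertThrows call above.
try {
  spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation).select("id");
  Assert.fail("Expected an AnalysisException because 'id' is not in the read schema");
} catch (org.apache.spark.sql.AnalysisException e) {
  Assert.assertTrue(e.getMessage().contains("cannot resolve"));
}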
Use of org.apache.spark.sql.types.StructType$ in project iceberg by Apache.
The class Spark3Util, method getPartitions.
/**
 * Use Spark to list all partitions in the table.
 *
 * @param spark a Spark session
 * @param rootPath the table's root path
 * @param format the file format of the table's data files
 * @return all of the table's partitions
 */
public static List<SparkPartition> getPartitions(SparkSession spark, Path rootPath, String format) {
  FileStatusCache fileStatusCache = FileStatusCache.getOrCreate(spark);
  Map<String, String> emptyMap = Collections.emptyMap();

  InMemoryFileIndex fileIndex = new InMemoryFileIndex(
      spark,
      JavaConverters.collectionAsScalaIterableConverter(ImmutableList.of(rootPath)).asScala().toSeq(),
      JavaConverters.mapAsScalaMapConverter(emptyMap).asScala().toMap(Predef.conforms()),
      Option.empty(),
      fileStatusCache,
      Option.empty(),
      Option.empty());

  org.apache.spark.sql.execution.datasources.PartitionSpec spec = fileIndex.partitionSpec();
  StructType schema = spec.partitionColumns();

  return JavaConverters.seqAsJavaListConverter(spec.partitions()).asJava().stream()
      .map(partition -> {
        Map<String, String> values = Maps.newHashMap();
        JavaConverters.asJavaIterableConverter(schema).asJava().forEach(field -> {
          int fieldIndex = schema.fieldIndex(field.name());
          Object catalystValue = partition.values().get(fieldIndex, field.dataType());
          Object value = CatalystTypeConverters.convertToScala(catalystValue, field.dataType());
          values.put(field.name(), String.valueOf(value));
        });
        return new SparkPartition(values, partition.path().toString(), format);
      })
      .collect(Collectors.toList());
}
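A hypothetical call site for getPartitions could look like the sketch below; the session settings, path, and format are illustrative values, not taken from the Iceberg sources:

// Hypothetical usage: list the Hive-style partitions found under an external Parquet directory.
SparkSession spark = SparkSession.builder().master("local[2]").appName("list-partitions").getOrCreate();
Path rootPath = new Path("file:/tmp/external_table");  // assumed location
List<SparkPartition> partitions = Spark3Util.getPartitions(spark, rootPath, "parquet");
partitions.forEach(partition -> System.out.println(partition));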
Use of org.apache.spark.sql.types.StructType$ in project iceberg by Apache.
The class SparkTestTable, method schema.
@Override
public StructType schema() {
  StructType schema = super.schema();
  if (metadataColumnNames != null) {
    for (String columnName : metadataColumnNames) {
      Types.NestedField metadataColumn = MetadataColumns.metadataColumn(table(), columnName);
      schema = schema.add(columnName, SparkSchemaUtil.convert(metadataColumn.type()));
    }
  }
  return schema;
}
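The loop above relies on StructType.add returning a new schema rather than mutating the existing one; a minimal sketch of that behavior, using a hypothetical metadata column name:

// StructType.add does not modify the receiver; it returns a copy with the extra field appended.
StructType base = new StructType()
    .add("id", DataTypes.IntegerType)
    .add("data", DataTypes.StringType);
StructType withMetadata = base.add("_example_metadata_column", DataTypes.StringType);  // hypothetical name
// base still has two fields; withMetadata has three.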