Example usage of org.apache.spark.sql.Row in the Apache Iceberg project:
the before() setup method of the SchemaEvolutionTest class.
@Before
public void before() throws IOException {
  tableLocation = Files.createTempDirectory("temp").toFile();

  // Table schema: five optional columns, partitioned by year(published).
  Schema schema = new Schema(
      optional(1, "title", Types.StringType.get()),
      optional(2, "price", Types.IntegerType.get()),
      optional(3, "author", Types.StringType.get()),
      optional(4, "published", Types.TimestampType.withZone()),
      optional(5, "genre", Types.StringType.get()));
  PartitionSpec spec = PartitionSpec.builderFor(schema).year("published").build();

  HadoopTables tables = new HadoopTables(spark.sessionState().newHadoopConf());
  table = tables.create(schema, spec, tableLocation.toString());

  // Load the JSON fixture and cast the loosely-typed columns to match the table schema,
  // then append the rows to the freshly created table.
  Dataset<Row> df = spark.read().json(dataLocation + "/books.json");
  Dataset<Row> typed = df.select(
      df.col("title"),
      df.col("price").cast(DataTypes.IntegerType),
      df.col("author"),
      df.col("published").cast(DataTypes.TimestampType),
      df.col("genre"));
  typed.write().format("iceberg").mode("append").save(tableLocation.toString());

  // Pick up the snapshot produced by the append above.
  table.refresh();
}
Example usage of org.apache.spark.sql.Row in the Apache Iceberg project:
the testFailIfSparkReadSchemaIsOff() method of the TestSparkSchema class.
@Test
public void testFailIfSparkReadSchemaIsOff() throws IOException {
  // Create an unpartitioned table and append a single (id, data) row.
  String location = temp.newFolder("iceberg-table").toString();
  new HadoopTables(CONF).create(SCHEMA, PartitionSpec.unpartitioned(), null, location);

  List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"));
  spark.createDataFrame(records, SimpleRecord.class)
      .select("id", "data")
      .write()
      .format("iceberg")
      .mode("append")
      .save(location);

  // "idd" does not exist in the table schema, so the read must be rejected.
  StructType badReadSchema = new StructType(new StructField[] {
      new StructField("idd", DataTypes.IntegerType, true, Metadata.empty())
  });
  AssertHelpers.assertThrows(
      "Iceberg should not allow a projection that contain unknown fields",
      java.lang.IllegalArgumentException.class,
      "Field idd not found in source schema",
      () -> spark.read().schema(badReadSchema).format("iceberg").load(location));
}
Example usage of org.apache.spark.sql.Row in the Apache Iceberg project:
the testSparkReadSchemaIsHonored() method of the TestSparkSchema class.
@Test
public void testSparkReadSchemaIsHonored() throws IOException {
  // Set up an unpartitioned table containing a single (id, data) row.
  String tableLocation = temp.newFolder("iceberg-table").toString();
  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);
  List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);

  // Read back with a user-supplied schema that keeps only the "id" column.
  StructType sparkReadSchema = new StructType(new StructField[] {
      new StructField("id", DataTypes.IntegerType, true, Metadata.empty())
  });
  Dataset<Row> resultDf = spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation);

  // Use collectAsList() (the Java-facing API) instead of casting collect()'s
  // erasure-typed Object[] to Row[], which can fail with ClassCastException
  // depending on the Spark version.
  List<Row> results = resultDf.collectAsList();
  Assert.assertEquals("Result size matches", 1, results.size());
  Assert.assertEquals("Row length matches with sparkReadSchema", 1, results.get(0).length());
  Assert.assertEquals("Row content matches data", 1, results.get(0).getInt(0));
}
Example usage of org.apache.spark.sql.Row in the Apache Iceberg project:
the testSparkReadSchemaCombinedWithProjection() method of the TestSparkSchema class.
@Test
public void testSparkReadSchemaCombinedWithProjection() throws IOException {
  // Set up an unpartitioned table containing a single (id, data) row.
  String tableLocation = temp.newFolder("iceberg-table").toString();
  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);
  List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);

  // Supply a two-column read schema, then project it down to "id" with select();
  // the resulting rows must contain only the projected column.
  StructType sparkReadSchema = new StructType(new StructField[] {
      new StructField("id", DataTypes.IntegerType, true, Metadata.empty()),
      new StructField("data", DataTypes.StringType, true, Metadata.empty())
  });
  Dataset<Row> resultDf =
      spark.read().schema(sparkReadSchema).format("iceberg").load(tableLocation).select("id");

  // Use collectAsList() (the Java-facing API) instead of casting collect()'s
  // erasure-typed Object[] to Row[], which can fail with ClassCastException
  // depending on the Spark version.
  List<Row> results = resultDf.collectAsList();
  Assert.assertEquals("Result size matches", 1, results.size());
  Assert.assertEquals("Row length matches with sparkReadSchema", 1, results.get(0).length());
  Assert.assertEquals("Row content matches data", 1, results.get(0).getInt(0));
}
Example usage of org.apache.spark.sql.Row in the Apache Iceberg project:
the testFailSparkReadSchemaCombinedWithProjectionWhenSchemaDoesNotContainProjection() method of the TestSparkSchema class.
@Test
public void testFailSparkReadSchemaCombinedWithProjectionWhenSchemaDoesNotContainProjection() throws IOException {
  // Create an unpartitioned table and append a single (id, data) row.
  String location = temp.newFolder("iceberg-table").toString();
  new HadoopTables(CONF).create(SCHEMA, PartitionSpec.unpartitioned(), null, location);

  List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"));
  spark.createDataFrame(records, SimpleRecord.class)
      .select("id", "data")
      .write()
      .format("iceberg")
      .mode("append")
      .save(location);

  // The read schema exposes only "data", so a subsequent select("id") must fail
  // analysis: the projected column is not part of the declared read schema.
  StructType dataOnlySchema = new StructType(new StructField[] {
      new StructField("data", DataTypes.StringType, true, Metadata.empty())
  });
  AssertHelpers.assertThrows(
      "Spark should not allow a projection that is not included in the read schema",
      org.apache.spark.sql.AnalysisException.class,
      "cannot resolve '`id`' given input columns: [data]",
      () -> spark.read().schema(dataOnlySchema).format("iceberg").load(location).select("id"));
}
Aggregations (related usage examples collected by the code-search index).