Use of org.apache.spark.sql.Row$ in project iceberg by Apache.
Class TestNameMappingProjection, method assertNameMappingProjection:
private void assertNameMappingProjection(DataFile dataFile, String tableName) {
  Schema filteredSchema = new Schema(required(1, "name", Types.StringType.get()));
  NameMapping nameMapping = MappingUtil.create(filteredSchema);

  Schema tableSchema = new Schema(
      required(1, "name", Types.StringType.get()),
      optional(2, "id", Types.IntegerType.get()));

  Table table = catalog.createTable(
      org.apache.iceberg.catalog.TableIdentifier.of(DB_NAME, tableName),
      tableSchema, PartitionSpec.unpartitioned());
  table.updateProperties().set(DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)).commit();
  table.newFastAppend().appendFile(dataFile).commit();

  List<Row> actual = spark.read().format("iceberg")
      .load(String.format("%s.%s", DB_NAME, tableName))
      .filter("name='Alice'")
      .collectAsList();

  Assert.assertEquals("Should project 1 record", 1, actual.size());
  Assert.assertEquals("Should equal to 'Alice'", "Alice", actual.get(0).getString(0));
  Assert.assertNull("should be null", actual.get(0).get(1));
}
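The DataFile argument is built by the per-format tests in the same class and is not shown here. A minimal caller sketch, assuming a Parquet file written without Iceberg field IDs already exists; the path, record count, and table name below are hypothetical, not the project's actual test code:

// Hypothetical caller sketch: the real tests write ORC/Avro/Parquet files without
// field IDs first, then wrap them in a DataFile and call the assertion above.
File parquetFile = new File("/tmp/name_mapping_test/data.parquet");  // hypothetical path
DataFile dataFile = DataFiles.builder(PartitionSpec.unpartitioned())
    .withPath(parquetFile.getAbsolutePath())
    .withFormat(FileFormat.PARQUET)
    .withFileSizeInBytes(parquetFile.length())
    .withRecordCount(2)  // e.g. one row for "Alice" and one other row
    .build();
assertNameMappingProjection(dataFile, "parquet_name_mapping_test");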
Use of org.apache.spark.sql.Row$ in project iceberg by Apache.
Class TestSparkTableUtilWithInMemoryCatalog, method testImportPartitionsWithSnapshotInheritance:
@Test
public void testImportPartitionsWithSnapshotInheritance() throws IOException {
  Table table = TABLES.create(SCHEMA, SPEC, tableLocation);
  table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit();

  List<SimpleRecord> records = Lists.newArrayList(
      new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));

  File parquetTableDir = temp.newFolder("parquet_table");
  String parquetTableLocation = parquetTableDir.toURI().toString();

  try {
    Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
    inputDF.select("id", "data").write()
        .format("parquet")
        .mode("append")
        .option("path", parquetTableLocation)
        .partitionBy("data")
        .saveAsTable("parquet_table");

    File stagingDir = temp.newFolder("staging-dir");
    List<SparkPartition> partitions =
        SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'");
    SparkTableUtil.importSparkPartitions(spark, partitions, table, table.spec(), stagingDir.toString());

    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
    List<SimpleRecord> actualRecords = spark.read().format("iceberg")
        .load(tableLocation)
        .orderBy("id")
        .as(Encoders.bean(SimpleRecord.class))
        .collectAsList();

    Assert.assertEquals("Result rows should match", expectedRecords, actualRecords);
  } finally {
    spark.sql("DROP TABLE parquet_table");
  }
}
Use of org.apache.spark.sql.Row$ in project iceberg by Apache.
Class TestSparkTableUtilWithInMemoryCatalog, method testImportTableWithMappingForNestedDataPartitionedTable:
@Test
public void testImportTableWithMappingForNestedDataPartitionedTable() throws IOException {
  File parquetTableDir = temp.newFolder("parquet_table");
  String parquetTableLocation = parquetTableDir.toURI().toString();

  try {
    Dataset<Row> df1 = spark.range(1, 2)
        .withColumn("extra_col", functions.lit(-1))
        .withColumn("struct", functions.expr("named_struct('nested_1', 'a', 'nested_2', 'd', 'nested_3', 'f')"))
        .withColumn("data", functions.lit("Z"));
    Dataset<Row> df2 = spark.range(2, 3)
        .withColumn("extra_col", functions.lit(-1))
        .withColumn("struct", functions.expr("named_struct('nested_1', 'b', 'nested_2', 'e', 'nested_3', 'g')"))
        .withColumn("data", functions.lit("Z"));
    df1.union(df2).coalesce(1)
        .select("id", "extra_col", "struct", "data")
        .write()
        .format("parquet")
        .mode("append")
        .option("path", parquetTableLocation)
        .partitionBy("data")
        .saveAsTable("parquet_table");

    // don't include `extra_col` and `nested_2` on purpose
    Schema schema = new Schema(
        optional(1, "id", Types.LongType.get()),
        required(2, "struct", Types.StructType.of(
            required(4, "nested_1", Types.StringType.get()),
            required(5, "nested_3", Types.StringType.get()))),
        required(3, "data", Types.StringType.get()));
    PartitionSpec spec = PartitionSpec.builderFor(schema).identity("data").build();
    Table table = TABLES.create(schema, spec, tableLocation);

    // assign a custom metrics config and a name mapping
    NameMapping nameMapping = MappingUtil.create(schema);
    table.updateProperties()
        .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts")
        .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "id", "full")
        .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "struct.nested_3", "full")
        .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping))
        .commit();

    File stagingDir = temp.newFolder("staging-dir");
    SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString());

    // validate we get the expected results back
    List<Row> expected = spark.table("parquet_table")
        .select("id", "struct.nested_1", "struct.nested_3", "data")
        .collectAsList();
    List<Row> actual = spark.read().format("iceberg").load(tableLocation)
        .select("id", "struct.nested_1", "struct.nested_3", "data")
        .collectAsList();
    Assert.assertEquals("Rows must match", expected, actual);

    // validate we persisted correct metrics
    Dataset<Row> fileDF = spark.read().format("iceberg").load(tableLocation + "#files");

    List<Row> bounds = fileDF.select("lower_bounds", "upper_bounds").collectAsList();
    Assert.assertEquals("Must have lower bounds for 2 columns", 2, bounds.get(0).getMap(0).size());
    Assert.assertEquals("Must have upper bounds for 2 columns", 2, bounds.get(0).getMap(1).size());

    Types.NestedField nestedField1 = table.schema().findField("struct.nested_1");
    checkFieldMetrics(fileDF, nestedField1, true);

    Types.NestedField id = table.schema().findField("id");
    checkFieldMetrics(fileDF, id, 1L, 2L);

    Types.NestedField nestedField3 = table.schema().findField("struct.nested_3");
    checkFieldMetrics(fileDF, nestedField3, "f", "g");
  } finally {
    spark.sql("DROP TABLE parquet_table");
  }
}
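checkFieldMetrics is a private helper of this test class and is not shown in the snippet. A rough sketch of how such checks could be written against the #files metadata table, assuming org.apache.iceberg.types.Conversions and java.nio.ByteBuffer are imported; the signatures and details below are illustrative, not the project's actual helper:

// Sketch: assert that no bounds were recorded for a column (e.g. under "counts" metrics mode).
private void checkFieldMetrics(Dataset<Row> fileDF, Types.NestedField field, boolean expectNullBounds) {
  Row bounds = fileDF.selectExpr(
      String.format("lower_bounds[%d]", field.fieldId()),
      String.format("upper_bounds[%d]", field.fieldId())).first();
  Assert.assertEquals("Unexpected lower bound for " + field.name(), expectNullBounds, bounds.isNullAt(0));
  Assert.assertEquals("Unexpected upper bound for " + field.name(), expectNullBounds, bounds.isNullAt(1));
}

// Sketch: assert the recorded min/max for a column with "full" metrics mode; string forms are
// compared to avoid CharSequence vs String mismatches for string columns.
private void checkFieldMetrics(Dataset<Row> fileDF, Types.NestedField field, Object expectedMin, Object expectedMax) {
  Row bounds = fileDF.selectExpr(
      String.format("lower_bounds[%d]", field.fieldId()),
      String.format("upper_bounds[%d]", field.fieldId())).first();
  Object actualMin = Conversions.fromByteBuffer(field.type(), ByteBuffer.wrap(bounds.getAs(0)));
  Object actualMax = Conversions.fromByteBuffer(field.type(), ByteBuffer.wrap(bounds.getAs(1)));
  Assert.assertEquals("Unexpected lower bound for " + field.name(), expectedMin.toString(), actualMin.toString());
  Assert.assertEquals("Unexpected upper bound for " + field.name(), expectedMax.toString(), actualMax.toString());
}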
Use of org.apache.spark.sql.Row$ in project iceberg by Apache.
Class TestSparkTableUtilWithInMemoryCatalog, method testImportTableWithInt96Timestamp:
@Test
public void testImportTableWithInt96Timestamp() throws IOException {
  File parquetTableDir = temp.newFolder("parquet_table");
  String parquetTableLocation = parquetTableDir.toURI().toString();

  try {
    spark.conf().set(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE().key(), "INT96");

    Column timestampColumn = functions.to_timestamp(functions.lit("2010-03-20 10:40:30.1234"));
    Dataset<Row> df = spark.range(1, 10).withColumn("tmp_col", timestampColumn);
    df.coalesce(1).select("id", "tmp_col")
        .write()
        .format("parquet")
        .mode("append")
        .option("path", parquetTableLocation)
        .saveAsTable("parquet_table");

    Schema schema = new Schema(
        optional(1, "id", Types.LongType.get()),
        optional(2, "tmp_col", Types.TimestampType.withZone()));
    Table table = TABLES.create(schema, PartitionSpec.unpartitioned(), tableLocation);

    // assign a custom metrics config and disable vectorized reads
    table.updateProperties()
        .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full")
        .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, "false")
        .commit();

    File stagingDir = temp.newFolder("staging-dir");
    SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString());

    // validate we get the expected results back
    List<Row> expected = spark.table("parquet_table").select("id", "tmp_col").collectAsList();
    List<Row> actual = spark.read().format("iceberg").load(tableLocation)
        .select("id", "tmp_col")
        .collectAsList();
    Assert.assertEquals("Rows must match", expected, actual);

    // validate we did not persist metrics for INT96
    Dataset<Row> fileDF = spark.read().format("iceberg").load(tableLocation + "#files");

    Types.NestedField timestampField = table.schema().findField("tmp_col");
    checkFieldMetrics(fileDF, timestampField, true);

    Types.NestedField idField = table.schema().findField("id");
    checkFieldMetrics(fileDF, idField, 1L, 9L);
  } finally {
    spark.sql("DROP TABLE parquet_table");
  }
}
Use of org.apache.spark.sql.Row$ in project iceberg by Apache.
Class TestIcebergExpressions, method testTruncateExpressions:
@Test
public void testTruncateExpressions() {
  sql("CREATE TABLE %s ( " +
      " int_c INT, long_c LONG, dec_c DECIMAL(4, 2), str_c STRING, binary_c BINARY " +
      ") USING iceberg", tableName);

  sql("CREATE TEMPORARY VIEW emp " +
      "AS SELECT * FROM VALUES (101, 10001, 10.65, '101-Employee', CAST('1234' AS BINARY)) " +
      "AS EMP(int_c, long_c, dec_c, str_c, binary_c)");

  sql("INSERT INTO %s SELECT * FROM emp", tableName);

  Dataset<Row> df = spark.sql("SELECT * FROM " + tableName);
  df.select(
        new Column(new IcebergTruncateTransform(df.col("int_c").expr(), 2)).as("int_c"),
        new Column(new IcebergTruncateTransform(df.col("long_c").expr(), 2)).as("long_c"),
        new Column(new IcebergTruncateTransform(df.col("dec_c").expr(), 50)).as("dec_c"),
        new Column(new IcebergTruncateTransform(df.col("str_c").expr(), 2)).as("str_c"),
        new Column(new IcebergTruncateTransform(df.col("binary_c").expr(), 2)).as("binary_c"))
      .createOrReplaceTempView("v");

  assertEquals("Should have expected rows",
      ImmutableList.of(row(100, 10000L, new BigDecimal("10.50"), "10", "12")),
      sql("SELECT int_c, long_c, dec_c, str_c, CAST(binary_c AS STRING) FROM v"));
}
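The expected row follows from Iceberg's truncate transform definition: integers and longs are truncated down to the nearest multiple of the width, decimals are truncated on their unscaled value, and strings and binary are cut to the first W characters or bytes. A small plain-Java sketch of that arithmetic (not using IcebergTruncateTransform) that reproduces the expected values:

// Truncate-transform arithmetic sketch, matching the expected row above.
int intTrunc = 101 - Math.floorMod(101, 2);            // 100
long longTrunc = 10001L - Math.floorMod(10001L, 2L);   // 10000
// decimal truncation works on the unscaled value: 10.65 -> unscaled 1065, width 50
java.math.BigDecimal dec = new java.math.BigDecimal("10.65");
java.math.BigInteger unscaled = dec.unscaledValue();                                   // 1065
java.math.BigInteger truncatedUnscaled =
    unscaled.subtract(unscaled.mod(java.math.BigInteger.valueOf(50)));                 // 1050
java.math.BigDecimal decTrunc = new java.math.BigDecimal(truncatedUnscaled, dec.scale()); // 10.50
String strTrunc = "101-Employee".substring(0, 2);      // "10"
String binTrunc = "1234".substring(0, 2);              // "12" (first 2 bytes of the UTF-8 binary)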