
Example 11 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

The class TestNameMappingProjection, method assertNameMappingProjection:

private void assertNameMappingProjection(DataFile dataFile, String tableName) {
    Schema filteredSchema = new Schema(required(1, "name", Types.StringType.get()));
    NameMapping nameMapping = MappingUtil.create(filteredSchema);
    Schema tableSchema = new Schema(required(1, "name", Types.StringType.get()), optional(2, "id", Types.IntegerType.get()));
    Table table = catalog.createTable(
        org.apache.iceberg.catalog.TableIdentifier.of(DB_NAME, tableName),
        tableSchema,
        PartitionSpec.unpartitioned());
    table.updateProperties().set(DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping)).commit();
    table.newFastAppend().appendFile(dataFile).commit();
    List<Row> actual = spark.read()
        .format("iceberg")
        .load(String.format("%s.%s", DB_NAME, tableName))
        .filter("name='Alice'")
        .collectAsList();
    Assert.assertEquals("Should project 1 record", 1, actual.size());
    Assert.assertEquals("Should equal to 'Alice'", "Alice", actual.get(0).getString(0));
    Assert.assertNull("should be null", actual.get(0).get(1));
}
Also used: NameMapping (org.apache.iceberg.mapping.NameMapping), Table (org.apache.iceberg.Table), Schema (org.apache.iceberg.Schema), Row (org.apache.spark.sql.Row)
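
A note on the `DataFile` parameter: the helper above receives a data file that the test builds elsewhere in the class. As a rough, hypothetical sketch (not the actual test code), such a file can be described with Iceberg's `DataFiles` builder; the path, size, and record count below are placeholder values:

// Sketch only: placeholder values, not taken from the Iceberg test.
DataFile dataFile = DataFiles.builder(PartitionSpec.unpartitioned())
    .withPath("/tmp/alice.parquet")   // placeholder path to an existing Parquet file
    .withFormat(FileFormat.PARQUET)
    .withFileSizeInBytes(1024L)       // placeholder file size
    .withRecordCount(1L)              // placeholder record count
    .build();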

Example 12 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

The class TestSparkTableUtilWithInMemoryCatalog, method testImportPartitionsWithSnapshotInheritance:

@Test
public void testImportPartitionsWithSnapshotInheritance() throws IOException {
    Table table = TABLES.create(SCHEMA, SPEC, tableLocation);
    table.updateProperties().set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true").commit();
    List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    File parquetTableDir = temp.newFolder("parquet_table");
    String parquetTableLocation = parquetTableDir.toURI().toString();
    try {
        Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
        inputDF.select("id", "data")
            .write()
            .format("parquet")
            .mode("append")
            .option("path", parquetTableLocation)
            .partitionBy("data")
            .saveAsTable("parquet_table");
        File stagingDir = temp.newFolder("staging-dir");
        List<SparkPartition> partitions = SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'");
        SparkTableUtil.importSparkPartitions(spark, partitions, table, table.spec(), stagingDir.toString());
        List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));
        List<SimpleRecord> actualRecords = spark.read()
            .format("iceberg")
            .load(tableLocation)
            .orderBy("id")
            .as(Encoders.bean(SimpleRecord.class))
            .collectAsList();
        Assert.assertEquals("Result rows should match", expectedRecords, actualRecords);
    } finally {
        spark.sql("DROP TABLE parquet_table");
    }
}
Also used: SparkPartition (org.apache.iceberg.spark.SparkTableUtil.SparkPartition), Table (org.apache.iceberg.Table), Row (org.apache.spark.sql.Row), File (java.io.File), Test (org.junit.Test)
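
The test reads the result back as typed `SimpleRecord` beans via `Encoders.bean`. The same verification can be done with untyped `Row` objects instead; a minimal sketch, assuming the `spark` session and `tableLocation` from the test and an integer `id` column:

// Sketch only: untyped access to the same result.
List<Row> rows = spark.read()
    .format("iceberg")
    .load(tableLocation)
    .orderBy("id")
    .collectAsList();
for (Row row : rows) {
    int id = row.getInt(0);          // "id" column
    String data = row.getString(1);  // "data" column
}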

Example 13 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

The class TestSparkTableUtilWithInMemoryCatalog, method testImportTableWithMappingForNestedDataPartitionedTable:

@Test
public void testImportTableWithMappingForNestedDataPartitionedTable() throws IOException {
    File parquetTableDir = temp.newFolder("parquet_table");
    String parquetTableLocation = parquetTableDir.toURI().toString();
    try {
        Dataset<Row> df1 = spark.range(1, 2)
            .withColumn("extra_col", functions.lit(-1))
            .withColumn("struct", functions.expr("named_struct('nested_1', 'a', 'nested_2', 'd', 'nested_3', 'f')"))
            .withColumn("data", functions.lit("Z"));
        Dataset<Row> df2 = spark.range(2, 3)
            .withColumn("extra_col", functions.lit(-1))
            .withColumn("struct", functions.expr("named_struct('nested_1', 'b', 'nested_2', 'e', 'nested_3', 'g')"))
            .withColumn("data", functions.lit("Z"));
        df1.union(df2)
            .coalesce(1)
            .select("id", "extra_col", "struct", "data")
            .write()
            .format("parquet")
            .mode("append")
            .option("path", parquetTableLocation)
            .partitionBy("data")
            .saveAsTable("parquet_table");
        // don't include `extra_col` and `nested_2` on purpose
        Schema schema = new Schema(
            optional(1, "id", Types.LongType.get()),
            required(2, "struct", Types.StructType.of(
                required(4, "nested_1", Types.StringType.get()),
                required(5, "nested_3", Types.StringType.get()))),
            required(3, "data", Types.StringType.get()));
        PartitionSpec spec = PartitionSpec.builderFor(schema).identity("data").build();
        Table table = TABLES.create(schema, spec, tableLocation);
        // assign a custom metrics config and a name mapping
        NameMapping nameMapping = MappingUtil.create(schema);
        table.updateProperties()
            .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts")
            .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "id", "full")
            .set(TableProperties.METRICS_MODE_COLUMN_CONF_PREFIX + "struct.nested_3", "full")
            .set(TableProperties.DEFAULT_NAME_MAPPING, NameMappingParser.toJson(nameMapping))
            .commit();
        File stagingDir = temp.newFolder("staging-dir");
        SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString());
        // validate we get the expected results back
        List<Row> expected = spark.table("parquet_table").select("id", "struct.nested_1", "struct.nested_3", "data").collectAsList();
        List<Row> actual = spark.read().format("iceberg").load(tableLocation).select("id", "struct.nested_1", "struct.nested_3", "data").collectAsList();
        Assert.assertEquals("Rows must match", expected, actual);
        // validate we persisted correct metrics
        Dataset<Row> fileDF = spark.read().format("iceberg").load(tableLocation + "#files");
        List<Row> bounds = fileDF.select("lower_bounds", "upper_bounds").collectAsList();
        Assert.assertEquals("Must have lower bounds for 2 columns", 2, bounds.get(0).getMap(0).size());
        Assert.assertEquals("Must have upper bounds for 2 columns", 2, bounds.get(0).getMap(1).size());
        Types.NestedField nestedField1 = table.schema().findField("struct.nested_1");
        checkFieldMetrics(fileDF, nestedField1, true);
        Types.NestedField id = table.schema().findField("id");
        checkFieldMetrics(fileDF, id, 1L, 2L);
        Types.NestedField nestedField3 = table.schema().findField("struct.nested_3");
        checkFieldMetrics(fileDF, nestedField3, "f", "g");
    } finally {
        spark.sql("DROP TABLE parquet_table");
    }
}
Also used: TableIdentifier (org.apache.spark.sql.catalyst.TableIdentifier), Types (org.apache.iceberg.types.Types), Table (org.apache.iceberg.Table), NameMapping (org.apache.iceberg.mapping.NameMapping), Schema (org.apache.iceberg.Schema), Row (org.apache.spark.sql.Row), File (java.io.File), PartitionSpec (org.apache.iceberg.PartitionSpec), Test (org.junit.Test)
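
The bounds assertions above use `Row.getMap`, which returns a Scala map. An alternative, shown here as a hedged sketch against the same `fileDF`, is `Row.getJavaMap`, which yields a `java.util.Map` keyed by Iceberg field id (the bound values are assumed to be serialized as binary):

// Sketch only: inspect which columns have bounds in the #files metadata table.
Row firstFile = fileDF.select("lower_bounds", "upper_bounds").first();
java.util.Map<Integer, byte[]> lowerBounds = firstFile.getJavaMap(0);
java.util.Map<Integer, byte[]> upperBounds = firstFile.getJavaMap(1);
System.out.println("field ids with lower bounds: " + lowerBounds.keySet());
System.out.println("field ids with upper bounds: " + upperBounds.keySet());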

Example 14 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

The class TestSparkTableUtilWithInMemoryCatalog, method testImportTableWithInt96Timestamp:

@Test
public void testImportTableWithInt96Timestamp() throws IOException {
    File parquetTableDir = temp.newFolder("parquet_table");
    String parquetTableLocation = parquetTableDir.toURI().toString();
    try {
        spark.conf().set(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE().key(), "INT96");
        Column timestampColumn = functions.to_timestamp(functions.lit("2010-03-20 10:40:30.1234"));
        Dataset<Row> df = spark.range(1, 10).withColumn("tmp_col", timestampColumn);
        df.coalesce(1)
            .select("id", "tmp_col")
            .write()
            .format("parquet")
            .mode("append")
            .option("path", parquetTableLocation)
            .saveAsTable("parquet_table");
        Schema schema = new Schema(optional(1, "id", Types.LongType.get()), optional(2, "tmp_col", Types.TimestampType.withZone()));
        Table table = TABLES.create(schema, PartitionSpec.unpartitioned(), tableLocation);
        // assign a custom metrics config and disable vectorized reads
        table.updateProperties()
            .set(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full")
            .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, "false")
            .commit();
        File stagingDir = temp.newFolder("staging-dir");
        SparkTableUtil.importSparkTable(spark, new TableIdentifier("parquet_table"), table, stagingDir.toString());
        // validate we get the expected results back
        List<Row> expected = spark.table("parquet_table").select("id", "tmp_col").collectAsList();
        List<Row> actual = spark.read().format("iceberg").load(tableLocation).select("id", "tmp_col").collectAsList();
        Assert.assertEquals("Rows must match", expected, actual);
        // validate we did not persist metrics for INT96
        Dataset<Row> fileDF = spark.read().format("iceberg").load(tableLocation + "#files");
        Types.NestedField timestampField = table.schema().findField("tmp_col");
        checkFieldMetrics(fileDF, timestampField, true);
        Types.NestedField idField = table.schema().findField("id");
        checkFieldMetrics(fileDF, idField, 1L, 9L);
    } finally {
        spark.sql("DROP TABLE parquet_table");
    }
}
Also used: TableIdentifier (org.apache.spark.sql.catalyst.TableIdentifier), Types (org.apache.iceberg.types.Types), Table (org.apache.iceberg.Table), Column (org.apache.spark.sql.Column), Schema (org.apache.iceberg.Schema), Row (org.apache.spark.sql.Row), File (java.io.File), Test (org.junit.Test)
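
`SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE().key()` resolves to a session property controlling how Spark encodes Parquet timestamps (assumed here to be `spark.sql.parquet.outputTimestampType`). A minimal sketch of setting and restoring it by its string key:

// Sketch only: write timestamps with the legacy INT96 encoding, then restore the default.
spark.conf().set("spark.sql.parquet.outputTimestampType", "INT96");
// ... write the Parquet source table ...
spark.conf().unset("spark.sql.parquet.outputTimestampType");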

Example 15 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

The class TestIcebergExpressions, method testTruncateExpressions:

@Test
public void testTruncateExpressions() {
    sql("CREATE TABLE %s ( " + "  int_c INT, long_c LONG, dec_c DECIMAL(4, 2), str_c STRING, binary_c BINARY " + ") USING iceberg", tableName);
    sql("CREATE TEMPORARY VIEW emp " + "AS SELECT * FROM VALUES (101, 10001, 10.65, '101-Employee', CAST('1234' AS BINARY)) " + "AS EMP(int_c, long_c, dec_c, str_c, binary_c)");
    sql("INSERT INTO %s SELECT * FROM emp", tableName);
    Dataset<Row> df = spark.sql("SELECT * FROM " + tableName);
    df.select(new Column(new IcebergTruncateTransform(df.col("int_c").expr(), 2)).as("int_c"), new Column(new IcebergTruncateTransform(df.col("long_c").expr(), 2)).as("long_c"), new Column(new IcebergTruncateTransform(df.col("dec_c").expr(), 50)).as("dec_c"), new Column(new IcebergTruncateTransform(df.col("str_c").expr(), 2)).as("str_c"), new Column(new IcebergTruncateTransform(df.col("binary_c").expr(), 2)).as("binary_c")).createOrReplaceTempView("v");
    assertEquals("Should have expected rows", ImmutableList.of(row(100, 10000L, new BigDecimal("10.50"), "10", "12")), sql("SELECT int_c, long_c, dec_c, str_c, CAST(binary_c AS STRING) FROM v"));
}
Also used: IcebergTruncateTransform (org.apache.spark.sql.catalyst.expressions.IcebergTruncateTransform), Column (org.apache.spark.sql.Column), Row (org.apache.spark.sql.Row), BigDecimal (java.math.BigDecimal), Test (org.junit.Test)
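
The expected row in the assertion reflects Iceberg's truncate semantics: integers and longs are truncated down to a multiple of the width, decimals are truncated on their unscaled value, and strings and binary are cut to a prefix of the given length. A small sketch reproducing the expected values by hand (plain Java, not the Iceberg transform itself):

// Sketch only: recompute the values asserted above.
int truncatedInt = 101 - Math.floorMod(101, 2);              // 100
long truncatedLong = 10001L - Math.floorMod(10001L, 2L);     // 10000
long truncatedUnscaled = 1065 - Math.floorMod(1065, 50);     // 1050, i.e. the DECIMAL(4, 2) value 10.50
String truncatedStr = "101-Employee".substring(0, 2);        // "10"
String truncatedBin = "1234".substring(0, 2);                // "12" (first 2 bytes of the binary)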

Aggregations

Row (org.apache.spark.sql.Row): 1045
Test (org.junit.Test): 344
ArrayList (java.util.ArrayList): 244
SparkSession (org.apache.spark.sql.SparkSession): 243
StructType (org.apache.spark.sql.types.StructType): 215
Test (org.junit.jupiter.api.Test): 157
StructField (org.apache.spark.sql.types.StructField): 138
Table (org.apache.iceberg.Table): 127
Dataset (org.apache.spark.sql.Dataset): 123
List (java.util.List): 115
Script (org.apache.sysml.api.mlcontext.Script): 104
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 101
IOException (java.io.IOException): 78
Column (org.apache.spark.sql.Column): 78
File (java.io.File): 76
Collectors (java.util.stream.Collectors): 73
PartitionSpec (org.apache.iceberg.PartitionSpec): 70
DatasetBuilder (au.csiro.pathling.test.builders.DatasetBuilder): 66
Map (java.util.Map): 66
HadoopTables (org.apache.iceberg.hadoop.HadoopTables): 61