
Example 96 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

From the class TestIcebergSourceTablesBase, method testSnapshotReadAfterAddColumn.

@Test
public synchronized void testSnapshotReadAfterAddColumn() {
    TableIdentifier tableIdentifier = TableIdentifier.of("db", "table");
    Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
    List<Row> originalRecords = Lists.newArrayList(RowFactory.create(1, "x"), RowFactory.create(2, "y"), RowFactory.create(3, "z"));
    StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA);
    Dataset<Row> inputDf = spark.createDataFrame(originalRecords, originalSparkSchema);
    inputDf.select("id", "data").write().format("iceberg").mode(SaveMode.Append).save(loadLocation(tableIdentifier));
    table.refresh();
    Dataset<Row> resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier));
    Assert.assertEquals("Records should match", originalRecords, resultDf.orderBy("id").collectAsList());
    Snapshot snapshotBeforeAddColumn = table.currentSnapshot();
    table.updateSchema().addColumn("category", Types.StringType.get()).commit();
    List<Row> newRecords = Lists.newArrayList(RowFactory.create(4, "xy", "B"), RowFactory.create(5, "xyz", "C"));
    StructType newSparkSchema = SparkSchemaUtil.convert(SCHEMA2);
    Dataset<Row> inputDf2 = spark.createDataFrame(newRecords, newSparkSchema);
    inputDf2.select("id", "data", "category").write().format("iceberg").mode(SaveMode.Append).save(loadLocation(tableIdentifier));
    table.refresh();
    List<Row> updatedRecords = Lists.newArrayList(
        RowFactory.create(1, "x", null), RowFactory.create(2, "y", null), RowFactory.create(3, "z", null),
        RowFactory.create(4, "xy", "B"), RowFactory.create(5, "xyz", "C"));
    Dataset<Row> resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier));
    Assert.assertEquals("Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList());
    Dataset<Row> resultDf3 = spark.read().format("iceberg").option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()).load(loadLocation(tableIdentifier));
    Assert.assertEquals("Records should match", originalRecords, resultDf3.orderBy("id").collectAsList());
    Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf3.schema());
}
Also used : TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Snapshot(org.apache.iceberg.Snapshot) Table(org.apache.iceberg.Table) StructType(org.apache.spark.sql.types.StructType) Row(org.apache.spark.sql.Row) Test(org.junit.Test)
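
A minimal sketch of the time-travel read exercised above, assuming a live SparkSession named spark, an Iceberg table location tableLocation, and a snapshot id obtained beforehand (for example from table.currentSnapshot().snapshotId()); "snapshot-id" is the string constant behind SparkReadOptions.SNAPSHOT_ID.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SnapshotReadSketch {
    // Read an Iceberg table as of an older snapshot; columns added after
    // that snapshot are not part of the returned schema.
    static Dataset<Row> readAtSnapshot(SparkSession spark, String tableLocation, long snapshotId) {
        return spark.read()
            .format("iceberg")
            .option("snapshot-id", snapshotId)
            .load(tableLocation);
    }
}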

Example 97 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

From the class TestIcebergSourceTablesBase, method testPrunedSnapshotsTable.

@Test
public void testPrunedSnapshotsTable() {
    TableIdentifier tableIdentifier = TableIdentifier.of("db", "snapshots_test");
    Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
    List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "1"));
    Dataset<Row> inputDf = spark.createDataFrame(records, SimpleRecord.class);
    inputDf.select("id", "data").write().format("iceberg").mode("append").save(loadLocation(tableIdentifier));
    table.refresh();
    long firstSnapshotTimestamp = table.currentSnapshot().timestampMillis();
    long firstSnapshotId = table.currentSnapshot().snapshotId();
    table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();
    long secondSnapshotTimestamp = table.currentSnapshot().timestampMillis();
    // rollback the table state to the first snapshot
    table.rollback().toSnapshotId(firstSnapshotId).commit();
    Dataset<Row> actualDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier, "snapshots")).select("operation", "committed_at", "summary", "parent_id");
    Schema projectedSchema = SparkSchemaUtil.convert(actualDf.schema());
    List<Row> actual = actualDf.collectAsList();
    GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema, "snapshots"));
    List<GenericData.Record> expected = Lists.newArrayList(
        builder.set("committed_at", firstSnapshotTimestamp * 1000).set("parent_id", null).set("operation", "append")
            .set("summary", ImmutableMap.of("added-records", "1", "added-data-files", "1", "changed-partition-count", "1", "total-data-files", "1", "total-records", "1")).build(),
        builder.set("committed_at", secondSnapshotTimestamp * 1000).set("parent_id", firstSnapshotId).set("operation", "delete")
            .set("summary", ImmutableMap.of("deleted-records", "1", "deleted-data-files", "1", "changed-partition-count", "1", "total-records", "0", "total-data-files", "0")).build());
    Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size());
    TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(0), actual.get(0));
    TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(1), actual.get(1));
}
Also used : TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder) Row(org.apache.spark.sql.Row) Test(org.junit.Test)
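
The snapshots metadata table can also be queried with SQL. A minimal sketch, assuming a Spark catalog configured for Iceberg where the table is reachable as db.snapshots_test; the column names match the projection used in the test.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class SnapshotsTableSketch {
    // Project snapshot history from the snapshots metadata table.
    static Dataset<Row> snapshotHistory(SparkSession spark) {
        return spark.sql(
            "SELECT committed_at, snapshot_id, parent_id, operation, summary "
            + "FROM db.snapshots_test.snapshots ORDER BY committed_at");
    }
}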

Example 98 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

From the class TestIcebergSourceTablesBase, method testFilesTable.

@Test
public void testFilesTable() throws Exception {
    TableIdentifier tableIdentifier = TableIdentifier.of("db", "files_test");
    Table table = createTable(tableIdentifier, SCHEMA, SPEC);
    Table entriesTable = loadTable(tableIdentifier, "entries");
    Table filesTable = loadTable(tableIdentifier, "files");
    Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);
    Dataset<Row> df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class);
    df1.select("id", "data").write().format("iceberg").mode("append").save(loadLocation(tableIdentifier));
    // add a second file
    df2.select("id", "data").write().format("iceberg").mode("append").save(loadLocation(tableIdentifier));
    // delete the first file to test that only live files are listed
    table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit();
    List<Row> actual = spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")).collectAsList();
    List<GenericData.Record> expected = Lists.newArrayList();
    for (ManifestFile manifest : table.currentSnapshot().dataManifests()) {
        InputFile in = table.io().newInputFile(manifest.path());
        try (CloseableIterable<GenericData.Record> rows = Avro.read(in).project(entriesTable.schema()).build()) {
            for (GenericData.Record record : rows) {
                if ((Integer) record.get("status") < 2) { // added or existing
                    GenericData.Record file = (GenericData.Record) record.get("data_file");
                    asMetadataRecord(file);
                    expected.add(file);
                }
            }
        }
    }
    Assert.assertEquals("Files table should have one row", 1, expected.size());
    Assert.assertEquals("Actual results should have one row", 1, actual.size());
    TestHelpers.assertEqualsSafe(filesTable.schema().asStruct(), expected.get(0), actual.get(0));
}
Also used : TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Table(org.apache.iceberg.Table) GenericData(org.apache.avro.generic.GenericData) ManifestFile(org.apache.iceberg.ManifestFile) InputFile(org.apache.iceberg.io.InputFile) Row(org.apache.spark.sql.Row) Test(org.junit.Test)
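
For comparison with the manifest-reading loop above, a minimal sketch that reads the files metadata table directly. It assumes an Iceberg-enabled Spark catalog where the table is reachable as db.files_test; file_path and record_count are standard columns of that table.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class FilesTableSketch {
    // List live data files for the table; deleted files no longer appear.
    static Dataset<Row> liveFiles(SparkSession spark) {
        return spark.sql("SELECT file_path, record_count FROM db.files_test.files");
    }
}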

Example 99 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

From the class TestDeleteFrom, method testDeleteFromUnpartitionedTable.

@Test
public void testDeleteFromUnpartitionedTable() throws NoSuchTableException {
    sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName);
    List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
    Dataset<Row> df = spark.createDataFrame(records, SimpleRecord.class);
    df.coalesce(1).writeTo(tableName).append();
    assertEquals("Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName));
    AssertHelpers.assertThrows("Should not delete when not all rows of a file match the filter", AnalysisException.class, "Cannot delete from", () -> sql("DELETE FROM %s WHERE id < 2", tableName));
    sql("DELETE FROM %s WHERE id < 4", tableName);
    assertEquals("Should have no rows after successful delete", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName));
}
Also used : SimpleRecord(org.apache.iceberg.spark.source.SimpleRecord) Row(org.apache.spark.sql.Row) Test(org.junit.Test)
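
A minimal sketch of the DELETE path this test covers, with tableName as a placeholder; whether a predicate succeeds as a metadata-only delete depends on whether it drops whole data files, which is exactly what the assertThrows above verifies.

import org.apache.spark.sql.SparkSession;

public class DeleteFromSketch {
    // Issue a row-filter delete; in this Spark/Iceberg combination it only
    // succeeds when every row of each affected file matches the predicate.
    static void deleteMatchingFiles(SparkSession spark, String tableName) {
        spark.sql(String.format("DELETE FROM %s WHERE id < 4", tableName));
    }
}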

Example 100 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

From the class TestUnpartitionedWrites, method testDataFrameV2Append.

@Test
public void testDataFrameV2Append() throws NoSuchTableException {
    Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName));
    List<SimpleRecord> data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e"));
    Dataset<Row> ds = spark.createDataFrame(data, SimpleRecord.class);
    ds.writeTo(tableName).append();
    Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName));
    List<Object[]> expected = ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e"));
    assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName));
}
Also used : SimpleRecord(org.apache.iceberg.spark.source.SimpleRecord) Row(org.apache.spark.sql.Row) Test(org.junit.Test)
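
A minimal sketch of the DataFrameWriterV2 append shown above, assuming the target table already exists (append() fails with NoSuchTableException otherwise) and reusing the SimpleRecord bean from the test sources.

import java.util.List;
import org.apache.iceberg.spark.source.SimpleRecord;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;

public class DataFrameV2AppendSketch {
    // Append rows to an existing Iceberg table through the V2 writer API.
    static void appendRows(SparkSession spark, String tableName, List<SimpleRecord> rows)
            throws NoSuchTableException {
        Dataset<Row> ds = spark.createDataFrame(rows, SimpleRecord.class);
        ds.writeTo(tableName).append();
    }
}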

Aggregations

Row (org.apache.spark.sql.Row) 1045
Test (org.junit.Test) 344
ArrayList (java.util.ArrayList) 244
SparkSession (org.apache.spark.sql.SparkSession) 243
StructType (org.apache.spark.sql.types.StructType) 215
Test (org.junit.jupiter.api.Test) 157
StructField (org.apache.spark.sql.types.StructField) 138
Table (org.apache.iceberg.Table) 127
Dataset (org.apache.spark.sql.Dataset) 123
List (java.util.List) 115
Script (org.apache.sysml.api.mlcontext.Script) 104
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 101
IOException (java.io.IOException) 78
Column (org.apache.spark.sql.Column) 78
File (java.io.File) 76
Collectors (java.util.stream.Collectors) 73
PartitionSpec (org.apache.iceberg.PartitionSpec) 70
DatasetBuilder (au.csiro.pathling.test.builders.DatasetBuilder) 66
Map (java.util.Map) 66
HadoopTables (org.apache.iceberg.hadoop.HadoopTables) 61