Use of org.apache.spark.sql.Row$ in project iceberg by apache.
The class TestIcebergSourceTablesBase, method testSnapshotReadAfterAddColumn.
@Test
public synchronized void testSnapshotReadAfterAddColumn() {
TableIdentifier tableIdentifier = TableIdentifier.of("db", "table");
Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
List<Row> originalRecords = Lists.newArrayList(RowFactory.create(1, "x"), RowFactory.create(2, "y"), RowFactory.create(3, "z"));
StructType originalSparkSchema = SparkSchemaUtil.convert(SCHEMA);
Dataset<Row> inputDf = spark.createDataFrame(originalRecords, originalSparkSchema);
inputDf.select("id", "data").write().format("iceberg").mode(SaveMode.Append).save(loadLocation(tableIdentifier));
table.refresh();
Dataset<Row> resultDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier));
Assert.assertEquals("Records should match", originalRecords, resultDf.orderBy("id").collectAsList());
Snapshot snapshotBeforeAddColumn = table.currentSnapshot();
table.updateSchema().addColumn("category", Types.StringType.get()).commit();
List<Row> newRecords = Lists.newArrayList(RowFactory.create(4, "xy", "B"), RowFactory.create(5, "xyz", "C"));
StructType newSparkSchema = SparkSchemaUtil.convert(SCHEMA2);
Dataset<Row> inputDf2 = spark.createDataFrame(newRecords, newSparkSchema);
inputDf2.select("id", "data", "category").write().format("iceberg").mode(SaveMode.Append).save(loadLocation(tableIdentifier));
table.refresh();
List<Row> updatedRecords = Lists.newArrayList(
    RowFactory.create(1, "x", null), RowFactory.create(2, "y", null), RowFactory.create(3, "z", null),
    RowFactory.create(4, "xy", "B"), RowFactory.create(5, "xyz", "C"));
Dataset<Row> resultDf2 = spark.read().format("iceberg").load(loadLocation(tableIdentifier));
Assert.assertEquals("Records should match", updatedRecords, resultDf2.orderBy("id").collectAsList());
Dataset<Row> resultDf3 = spark.read().format("iceberg").option(SparkReadOptions.SNAPSHOT_ID, snapshotBeforeAddColumn.snapshotId()).load(loadLocation(tableIdentifier));
Assert.assertEquals("Records should match", originalRecords, resultDf3.orderBy("id").collectAsList());
Assert.assertEquals("Schemas should match", originalSparkSchema, resultDf3.schema());
}
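The key pattern in this test is the snapshot-pinned read used for resultDf3. Isolated as a minimal sketch, reusing the spark session and imports of the test above; the table identifier and the literal ids are illustrative only, and the AS_OF_TIMESTAMP constant is assumed to sit alongside SNAPSHOT_ID in SparkReadOptions:
// Hypothetical identifiers; pin a read either to one snapshot id or to a wall-clock time.
Dataset<Row> bySnapshotId = spark.read()
    .format("iceberg")
    .option(SparkReadOptions.SNAPSHOT_ID, 8924558786060583479L) // a snapshot id from the snapshots metadata table
    .load("db.table");
Dataset<Row> asOfTime = spark.read()
    .format("iceberg")
    .option(SparkReadOptions.AS_OF_TIMESTAMP, 1650479200000L)   // milliseconds since epoch
    .load("db.table");
Either option yields the table's data and schema as of that snapshot, which is what the schema assertion at the end of the test verifies.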
Use of org.apache.spark.sql.Row$ in project iceberg by apache.
The class TestIcebergSourceTablesBase, method testPrunedSnapshotsTable.
@Test
public void testPrunedSnapshotsTable() {
TableIdentifier tableIdentifier = TableIdentifier.of("db", "snapshots_test");
Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "1"));
Dataset<Row> inputDf = spark.createDataFrame(records, SimpleRecord.class);
inputDf.select("id", "data").write().format("iceberg").mode("append").save(loadLocation(tableIdentifier));
table.refresh();
long firstSnapshotTimestamp = table.currentSnapshot().timestampMillis();
long firstSnapshotId = table.currentSnapshot().snapshotId();
table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();
long secondSnapshotTimestamp = table.currentSnapshot().timestampMillis();
// rollback the table state to the first snapshot
table.rollback().toSnapshotId(firstSnapshotId).commit();
Dataset<Row> actualDf = spark.read().format("iceberg").load(loadLocation(tableIdentifier, "snapshots")).select("operation", "committed_at", "summary", "parent_id");
Schema projectedSchema = SparkSchemaUtil.convert(actualDf.schema());
List<Row> actual = actualDf.collectAsList();
GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(projectedSchema, "snapshots"));
List<GenericData.Record> expected = Lists.newArrayList(
    builder.set("committed_at", firstSnapshotTimestamp * 1000)
        .set("parent_id", null)
        .set("operation", "append")
        .set("summary", ImmutableMap.of(
            "added-records", "1",
            "added-data-files", "1",
            "changed-partition-count", "1",
            "total-data-files", "1",
            "total-records", "1"))
        .build(),
    builder.set("committed_at", secondSnapshotTimestamp * 1000)
        .set("parent_id", firstSnapshotId)
        .set("operation", "delete")
        .set("summary", ImmutableMap.of(
            "deleted-records", "1",
            "deleted-data-files", "1",
            "changed-partition-count", "1",
            "total-records", "0",
            "total-data-files", "0"))
        .build());
Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size());
TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(0), actual.get(0));
TestHelpers.assertEqualsSafe(projectedSchema.asStruct(), expected.get(1), actual.get(1));
}
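Outside the Avro-based verification in the test, the same metadata can be inspected directly. A minimal sketch, reusing the spark session above and assuming a catalog-backed identifier ("db.snapshots_test" is illustrative) so the snapshots metadata table can be addressed by name:
// Read the snapshots metadata table and keep the columns the test projects, plus snapshot_id.
Dataset<Row> snapshots = spark.read()
    .format("iceberg")
    .load("db.snapshots_test.snapshots");
snapshots.select("committed_at", "snapshot_id", "parent_id", "operation", "summary")
    .orderBy("committed_at")
    .show(false);
Note that, as the test shows, the rolled-back snapshot is still listed: rollback moves the current snapshot pointer but does not remove snapshots from the metadata table.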
Use of org.apache.spark.sql.Row$ in project iceberg by apache.
The class TestIcebergSourceTablesBase, method testFilesTable.
@Test
public void testFilesTable() throws Exception {
TableIdentifier tableIdentifier = TableIdentifier.of("db", "files_test");
Table table = createTable(tableIdentifier, SCHEMA, SPEC);
Table entriesTable = loadTable(tableIdentifier, "entries");
Table filesTable = loadTable(tableIdentifier, "files");
Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);
Dataset<Row> df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class);
df1.select("id", "data").write().format("iceberg").mode("append").save(loadLocation(tableIdentifier));
// add a second file
df2.select("id", "data").write().format("iceberg").mode("append").save(loadLocation(tableIdentifier));
// delete the first file to test that only live files are listed
table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit();
List<Row> actual = spark.read().format("iceberg").load(loadLocation(tableIdentifier, "files")).collectAsList();
List<GenericData.Record> expected = Lists.newArrayList();
for (ManifestFile manifest : table.currentSnapshot().dataManifests()) {
  InputFile in = table.io().newInputFile(manifest.path());
  try (CloseableIterable<GenericData.Record> rows = Avro.read(in).project(entriesTable.schema()).build()) {
    for (GenericData.Record record : rows) {
      if ((Integer) record.get("status") < 2) { // added or existing
        GenericData.Record file = (GenericData.Record) record.get("data_file");
        asMetadataRecord(file);
        expected.add(file);
      }
    }
  }
}
Assert.assertEquals("Files table should have one row", 1, expected.size());
Assert.assertEquals("Actual results should have one row", 1, actual.size());
TestHelpers.assertEqualsSafe(filesTable.schema().asStruct(), expected.get(0), actual.get(0));
}
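In day-to-day use the files metadata table is usually queried directly rather than cross-checked against raw manifests as the test does. A minimal sketch, with an assumed catalog identifier and a selection of the standard files-table column names:
// List live data files with a few of the standard files-table columns.
spark.read()
    .format("iceberg")
    .load("db.files_test.files")
    .select("file_path", "file_format", "record_count", "file_size_in_bytes")
    .show(false);
Because the delete in the test dropped the file containing id = 1, only the surviving file should appear here, matching the single-row assertions above.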
Use of org.apache.spark.sql.Row$ in project iceberg by apache.
The class TestDeleteFrom, method testDeleteFromUnpartitionedTable.
@Test
public void testDeleteFromUnpartitionedTable() throws NoSuchTableException {
sql("CREATE TABLE %s (id bigint, data string) USING iceberg", tableName);
List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
Dataset<Row> df = spark.createDataFrame(records, SimpleRecord.class);
df.coalesce(1).writeTo(tableName).append();
assertEquals("Should have expected rows", ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c")), sql("SELECT * FROM %s ORDER BY id", tableName));
AssertHelpers.assertThrows("Should not delete when not all rows of a file match the filter", AnalysisException.class, "Cannot delete from", () -> sql("DELETE FROM %s WHERE id < 2", tableName));
sql("DELETE FROM %s WHERE id < 4", tableName);
assertEquals("Should have no rows after successful delete", ImmutableList.of(), sql("SELECT * FROM %s ORDER BY id", tableName));
}
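The two DELETE statements in the test hinge on whether the predicate can be satisfied by dropping whole data files. A minimal SQL-through-Java sketch of that distinction; the table name is an assumed example, and the failure case mirrors the AnalysisException asserted above:
// All three rows were coalesced into one data file, so a filter matching only part of it is rejected.
spark.sql("DELETE FROM db.delete_test WHERE id < 2"); // fails: would require rewriting rows inside a file
// A filter matching every row in the file can be applied as a metadata-only delete.
spark.sql("DELETE FROM db.delete_test WHERE id < 4"); // succeeds: whole files are simply dropped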
Use of org.apache.spark.sql.Row$ in project iceberg by apache.
The class TestUnpartitionedWrites, method testDataFrameV2Append.
@Test
public void testDataFrameV2Append() throws NoSuchTableException {
Assert.assertEquals("Should have 3 rows", 3L, scalarSql("SELECT count(*) FROM %s", tableName));
List<SimpleRecord> data = ImmutableList.of(new SimpleRecord(4, "d"), new SimpleRecord(5, "e"));
Dataset<Row> ds = spark.createDataFrame(data, SimpleRecord.class);
ds.writeTo(tableName).append();
Assert.assertEquals("Should have 5 rows after insert", 5L, scalarSql("SELECT count(*) FROM %s", tableName));
List<Object[]> expected = ImmutableList.of(row(1L, "a"), row(2L, "b"), row(3L, "c"), row(4L, "d"), row(5L, "e"));
assertEquals("Row data should match expected", expected, sql("SELECT * FROM %s ORDER BY id", tableName));
}
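writeTo returns Spark's DataFrameWriterV2, which is what the test exercises with append(). A minimal sketch of the verbs commonly paired with Iceberg tables; the identifier is an assumed example, ds is any Dataset<Row> as above, and append and overwritePartitions throw NoSuchTableException just as the test method declares:
// V2 writer verbs: append rows, dynamically overwrite matching partitions, or (re)create the table.
ds.writeTo("db.v2_test").append();
ds.writeTo("db.v2_test").overwritePartitions();
ds.writeTo("db.v2_test").createOrReplace();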