Use of org.apache.spark.sql.Row in project iceberg by apache.
Class TestRemoveOrphanFilesProcedure, method testRemoveOrphanFilesWithDeleteFiles.
@Test
public void testRemoveOrphanFilesWithDeleteFiles() throws Exception {
  sql(
      "CREATE TABLE %s (id int, data string) USING iceberg TBLPROPERTIES"
          + "('format-version'='2', 'write.delete.mode'='merge-on-read')",
      tableName);

  List<SimpleRecord> records =
      Lists.newArrayList(
          new SimpleRecord(1, "a"), new SimpleRecord(2, "b"),
          new SimpleRecord(3, "c"), new SimpleRecord(4, "d"));
  spark.createDataset(records, Encoders.bean(SimpleRecord.class)).coalesce(1).writeTo(tableName).append();
  sql("DELETE FROM %s WHERE id=1", tableName);

  Table table = Spark3Util.loadIcebergTable(spark, tableName);

  Assert.assertEquals("Should have 1 delete manifest", 1, TestHelpers.deleteManifests(table).size());
  Assert.assertEquals("Should have 1 delete file", 1, TestHelpers.deleteFiles(table).size());

  Path deleteManifestPath = new Path(TestHelpers.deleteManifests(table).iterator().next().path());
  Path deleteFilePath = new Path(String.valueOf(TestHelpers.deleteFiles(table).iterator().next().path()));

  // wait to ensure files are old enough
  waitUntilAfter(System.currentTimeMillis());

  Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis()));

  // delete orphans
  List<Object[]> output =
      sql(
          "CALL %s.system.remove_orphan_files(table => '%s', older_than => TIMESTAMP '%s')",
          catalogName, tableIdent, currentTimestamp);
  Assert.assertEquals("Should be no orphan files", 0, output.size());

  FileSystem localFs = FileSystem.getLocal(new Configuration());
  Assert.assertTrue("Delete manifest should still exist", localFs.exists(deleteManifestPath));
  Assert.assertTrue("Delete file should still exist", localFs.exists(deleteFilePath));

  records.remove(new SimpleRecord(1, "a"));
  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableName);
  List<SimpleRecord> actualRecords = resultDF.as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Rows must match", records, actualRecords);
}
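Outside the JUnit harness, the same procedure can be previewed before any files are actually deleted. The sketch below is illustrative only: it assumes a Spark session configured with an Iceberg catalog named my_catalog and a table db.sample (both placeholder names) and uses the documented dry_run option, which lists candidate orphan files without removing them.

// Minimal sketch: preview orphan files with dry_run. Catalog and table names are placeholders.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class RemoveOrphanFilesExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("remove-orphan-files-sketch")
        .getOrCreate();

    // With dry_run => true nothing is deleted; the result lists candidate orphan file locations.
    Dataset<Row> candidates = spark.sql(
        "CALL my_catalog.system.remove_orphan_files(table => 'db.sample', dry_run => true)");
    candidates.show(false);

    spark.stop();
  }
}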
Use of org.apache.spark.sql.Row in project iceberg by apache.
Class TestRequiredDistributionAndOrdering, method testRangeDistributionWithQuotedColumnNames.
@Test
public void testRangeDistributionWithQuotedColumnNames() throws NoSuchTableException {
  sql(
      "CREATE TABLE %s (`c.1` INT, c2 STRING, c3 STRING) "
          + "USING iceberg PARTITIONED BY (bucket(2, `c.1`))",
      tableName);

  List<ThreeColumnRecord> data =
      ImmutableList.of(
          new ThreeColumnRecord(1, null, "A"), new ThreeColumnRecord(2, "BBBBBBBBBB", "B"),
          new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), new ThreeColumnRecord(4, "BBBBBBBBBB", "B"),
          new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), new ThreeColumnRecord(6, "BBBBBBBBBB", "B"),
          new ThreeColumnRecord(7, "BBBBBBBBBB", "A"));
  Dataset<Row> ds = spark.createDataFrame(data, ThreeColumnRecord.class);
  Dataset<Row> inputDF = ds.selectExpr("c1 as `c.1`", "c2", "c3").coalesce(1).sortWithinPartitions("`c.1`");

  sql("ALTER TABLE %s WRITE ORDERED BY `c.1`, c2", tableName);
  inputDF.writeTo(tableName).append();

  assertEquals("Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName));
}
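The quoting rule the test exercises can be reproduced directly in SQL. The sketch below is a minimal, illustrative example assuming a Spark session with Iceberg's SQL extensions enabled and a placeholder table my_catalog.db.sample: a column whose name contains a dot has to be back-quoted everywhere it is referenced, in the DDL, the partition transform, and the requested write order.

// Minimal sketch: back-quoting a dotted column name. Catalog, schema, and table names are placeholders.
import org.apache.spark.sql.SparkSession;

public class QuotedColumnWriteOrderExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("quoted-column-write-order-sketch")
        .getOrCreate();

    // The dotted column `c.1` must be back-quoted in the schema and in the partition transform.
    spark.sql("CREATE TABLE my_catalog.db.sample (`c.1` INT, c2 STRING) USING iceberg "
        + "PARTITIONED BY (bucket(2, `c.1`))");

    // The requested sort order has to quote the dotted column as well.
    spark.sql("ALTER TABLE my_catalog.db.sample WRITE ORDERED BY `c.1`, c2");

    spark.stop();
  }
}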
Use of org.apache.spark.sql.Row in project iceberg by apache.
Class TestRequiredDistributionAndOrdering, method testPartitionColumnsArePrependedForRangeDistribution.
@Test
public void testPartitionColumnsArePrependedForRangeDistribution() throws NoSuchTableException {
  sql(
      "CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) "
          + "USING iceberg PARTITIONED BY (bucket(2, c1))",
      tableName);

  List<ThreeColumnRecord> data =
      ImmutableList.of(
          new ThreeColumnRecord(1, null, "A"), new ThreeColumnRecord(2, "BBBBBBBBBB", "B"),
          new ThreeColumnRecord(3, "BBBBBBBBBB", "A"), new ThreeColumnRecord(4, "BBBBBBBBBB", "B"),
          new ThreeColumnRecord(5, "BBBBBBBBBB", "A"), new ThreeColumnRecord(6, "BBBBBBBBBB", "B"),
          new ThreeColumnRecord(7, "BBBBBBBBBB", "A"));
  Dataset<Row> ds = spark.createDataFrame(data, ThreeColumnRecord.class);
  Dataset<Row> inputDF = ds.coalesce(1).sortWithinPartitions("c1");

  // should automatically prepend partition columns to the ordering
  sql("ALTER TABLE %s WRITE ORDERED BY c1, c2", tableName);
  inputDF.writeTo(tableName).append();

  assertEquals("Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName));
}
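The write layout can also be requested explicitly instead of relying on WRITE ORDERED BY to imply a range distribution. The sketch below is illustrative, assuming an Iceberg-enabled Spark session and placeholder names (my_catalog.db.sample); both statements use documented Iceberg syntax, one through DDL and one through a table property.

// Minimal sketch: explicitly choosing write distribution and ordering. Names are placeholders.
import org.apache.spark.sql.SparkSession;

public class WriteDistributionExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("write-distribution-sketch")
        .getOrCreate();

    // Cluster rows by partition and sort within each write task.
    spark.sql("ALTER TABLE my_catalog.db.sample "
        + "WRITE DISTRIBUTED BY PARTITION LOCALLY ORDERED BY c1, c2");

    // Alternatively, control the distribution through the write.distribution-mode property.
    spark.sql("ALTER TABLE my_catalog.db.sample "
        + "SET TBLPROPERTIES ('write.distribution-mode'='range')");

    spark.stop();
  }
}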
Use of org.apache.spark.sql.Row in project iceberg by apache.
Class TestRollbackToSnapshotProcedure, method testRollbackToSnapshotRefreshesRelationCache.
@Test
public void testRollbackToSnapshotRefreshesRelationCache() {
  sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
  sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);

  Table table = validationCatalog.loadTable(tableIdent);
  Snapshot firstSnapshot = table.currentSnapshot();

  sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);

  table.refresh();
  Snapshot secondSnapshot = table.currentSnapshot();

  Dataset<Row> query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1");
  query.createOrReplaceTempView("tmp");

  spark.sql("CACHE TABLE tmp");

  assertEquals("View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp"));

  List<Object[]> output =
      sql(
          "CALL %s.system.rollback_to_snapshot(table => '%s', snapshot_id => %dL)",
          catalogName, tableIdent, firstSnapshot.snapshotId());
  assertEquals("Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output);

  assertEquals("View cache must be invalidated", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp"));

  sql("UNCACHE TABLE tmp");
}
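A standalone way to drive the same procedure is to look up a snapshot id in the snapshots metadata table and pass it to rollback_to_snapshot, which returns the previous and current snapshot ids. The sketch below is illustrative, assuming a placeholder catalog my_catalog and table db.sample.

// Minimal sketch: pick a snapshot id from the snapshots metadata table and roll back to it.
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class RollbackToSnapshotExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("rollback-to-snapshot-sketch")
        .getOrCreate();

    // Pick the oldest snapshot id recorded for the table.
    Row oldest = spark.sql(
        "SELECT snapshot_id FROM my_catalog.db.sample.snapshots ORDER BY committed_at LIMIT 1")
        .first();
    long snapshotId = oldest.getLong(0);

    // Roll the table back; the result row holds previous_snapshot_id and current_snapshot_id.
    spark.sql(String.format(
        "CALL my_catalog.system.rollback_to_snapshot(table => 'db.sample', snapshot_id => %d)",
        snapshotId)).show(false);

    spark.stop();
  }
}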
Use of org.apache.spark.sql.Row in project iceberg by apache.
Class TestRollbackToTimestampProcedure, method testRollbackToTimestampRefreshesRelationCache.
@Test
public void testRollbackToTimestampRefreshesRelationCache() {
  sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
  sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);

  Table table = validationCatalog.loadTable(tableIdent);
  Snapshot firstSnapshot = table.currentSnapshot();
  String firstSnapshotTimestamp = LocalDateTime.now().toString();

  waitUntilAfter(firstSnapshot.timestampMillis());

  sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);

  table.refresh();
  Snapshot secondSnapshot = table.currentSnapshot();

  Dataset<Row> query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1");
  query.createOrReplaceTempView("tmp");

  spark.sql("CACHE TABLE tmp");

  assertEquals("View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp"));

  List<Object[]> output =
      sql(
          "CALL %s.system.rollback_to_timestamp(table => '%s', timestamp => TIMESTAMP '%s')",
          catalogName, tableIdent, firstSnapshotTimestamp);
  assertEquals("Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output);

  assertEquals("View cache must be invalidated", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp"));

  sql("UNCACHE TABLE tmp");
}
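The timestamp variant can be called the same way with a SQL TIMESTAMP literal: the table is rolled back to the snapshot that was current at the given time. The sketch below is illustrative, with placeholder catalog/table names and an arbitrary timestamp; in practice the timestamp should fall after the snapshot you want to restore and before any later commits.

// Minimal sketch: roll back to whichever snapshot was current at a point in time. Names are placeholders.
import org.apache.spark.sql.SparkSession;

public class RollbackToTimestampExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("rollback-to-timestamp-sketch")
        .getOrCreate();

    // The timestamp literal is only an example; choose a time at which a snapshot existed.
    spark.sql("CALL my_catalog.system.rollback_to_timestamp("
        + "table => 'db.sample', timestamp => TIMESTAMP '2023-01-01 00:00:00')")
        .show(false);

    spark.stop();
  }
}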