
Example 21 with Row$

Use of org.apache.spark.sql.Row$ in project iceberg by apache.

The class TestRemoveOrphanFilesProcedure, method testRemoveOrphanFilesWithDeleteFiles:

@Test
public void testRemoveOrphanFilesWithDeleteFiles() throws Exception {
    sql("CREATE TABLE %s (id int, data string) USING iceberg TBLPROPERTIES" + "('format-version'='2', 'write.delete.mode'='merge-on-read')", tableName);
    List<SimpleRecord> records = Lists.newArrayList(
        new SimpleRecord(1, "a"),
        new SimpleRecord(2, "b"),
        new SimpleRecord(3, "c"),
        new SimpleRecord(4, "d"));
    spark.createDataset(records, Encoders.bean(SimpleRecord.class)).coalesce(1).writeTo(tableName).append();
    sql("DELETE FROM %s WHERE id=1", tableName);
    Table table = Spark3Util.loadIcebergTable(spark, tableName);
    Assert.assertEquals("Should have 1 delete manifest", 1, TestHelpers.deleteManifests(table).size());
    Assert.assertEquals("Should have 1 delete file", 1, TestHelpers.deleteFiles(table).size());
    Path deleteManifestPath = new Path(TestHelpers.deleteManifests(table).iterator().next().path());
    Path deleteFilePath = new Path(String.valueOf(TestHelpers.deleteFiles(table).iterator().next().path()));
    // wait to ensure files are old enough
    waitUntilAfter(System.currentTimeMillis());
    Timestamp currentTimestamp = Timestamp.from(Instant.ofEpochMilli(System.currentTimeMillis()));
    // delete orphans
    List<Object[]> output = sql(
        "CALL %s.system.remove_orphan_files("
            + "table => '%s',"
            + "older_than => TIMESTAMP '%s')",
        catalogName, tableIdent, currentTimestamp);
    Assert.assertEquals("Should be no orphan files", 0, output.size());
    FileSystem localFs = FileSystem.getLocal(new Configuration());
    Assert.assertTrue("Delete manifest should still exist", localFs.exists(deleteManifestPath));
    Assert.assertTrue("Delete file should still exist", localFs.exists(deleteFilePath));
    records.remove(new SimpleRecord(1, "a"));
    Dataset<Row> resultDF = spark.read().format("iceberg").load(tableName);
    List<SimpleRecord> actualRecords = resultDF.as(Encoders.bean(SimpleRecord.class)).collectAsList();
    Assert.assertEquals("Rows must match", records, actualRecords);
}
Also used : Path (org.apache.hadoop.fs.Path), Table (org.apache.iceberg.Table), Configuration (org.apache.hadoop.conf.Configuration), FileSystem (org.apache.hadoop.fs.FileSystem), SimpleRecord (org.apache.iceberg.spark.source.SimpleRecord), Row (org.apache.spark.sql.Row), Timestamp (java.sql.Timestamp), Test (org.junit.Test)
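As a minimal standalone sketch (not part of the test above), the same remove_orphan_files procedure can be invoked from any SparkSession. The catalog name my_catalog and table db.tbl below are hypothetical placeholders; each returned Row is expected to carry the location of one removed orphan file.

import java.sql.Timestamp;
import java.time.Instant;
import java.util.List;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class RemoveOrphanFilesSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("remove-orphan-files-sketch")
            .getOrCreate();
        // Only files older than this timestamp are candidates for removal.
        Timestamp olderThan = Timestamp.from(Instant.now());
        // "my_catalog" and "db.tbl" are hypothetical placeholders.
        List<Row> removed = spark.sql(String.format(
            "CALL my_catalog.system.remove_orphan_files(table => 'db.tbl', older_than => TIMESTAMP '%s')",
            olderThan)).collectAsList();
        // Each Row holds the location of one file that was cleaned up.
        removed.forEach(row -> System.out.println(row.getString(0)));
        spark.stop();
    }
}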

Example 22 with Row$

Use of org.apache.spark.sql.Row$ in project iceberg by apache.

The class TestRequiredDistributionAndOrdering, method testRangeDistributionWithQuotedColumnNames:

@Test
public void testRangeDistributionWithQuotedColumnNames() throws NoSuchTableException {
    sql("CREATE TABLE %s (`c.1` INT, c2 STRING, c3 STRING) " + "USING iceberg " + "PARTITIONED BY (bucket(2, `c.1`))", tableName);
    List<ThreeColumnRecord> data = ImmutableList.of(
        new ThreeColumnRecord(1, null, "A"),
        new ThreeColumnRecord(2, "BBBBBBBBBB", "B"),
        new ThreeColumnRecord(3, "BBBBBBBBBB", "A"),
        new ThreeColumnRecord(4, "BBBBBBBBBB", "B"),
        new ThreeColumnRecord(5, "BBBBBBBBBB", "A"),
        new ThreeColumnRecord(6, "BBBBBBBBBB", "B"),
        new ThreeColumnRecord(7, "BBBBBBBBBB", "A"));
    Dataset<Row> ds = spark.createDataFrame(data, ThreeColumnRecord.class);
    Dataset<Row> inputDF = ds.selectExpr("c1 as `c.1`", "c2", "c3").coalesce(1).sortWithinPartitions("`c.1`");
    sql("ALTER TABLE %s WRITE ORDERED BY `c.1`, c2", tableName);
    inputDF.writeTo(tableName).append();
    assertEquals("Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName));
}
Also used : Row (org.apache.spark.sql.Row), ThreeColumnRecord (org.apache.iceberg.spark.source.ThreeColumnRecord), Test (org.junit.Test)
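For illustration only, a small sketch (not from the test) of how the sort order declared by ALTER TABLE ... WRITE ORDERED BY could be read back through the Iceberg Table API. It assumes a SparkSession and a fully qualified table name like the ones the surrounding tests use.

import org.apache.iceberg.SortOrder;
import org.apache.iceberg.Table;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.spark.sql.SparkSession;

class SortOrderSketch {
    // Prints the sort order currently declared on an Iceberg table.
    static void printSortOrder(SparkSession spark, String tableName) throws Exception {
        Table table = Spark3Util.loadIcebergTable(spark, tableName);
        SortOrder sortOrder = table.sortOrder();
        // Each SortField records the source column id, direction, and null ordering.
        sortOrder.fields().forEach(field ->
            System.out.println(field.sourceId() + " " + field.direction() + " " + field.nullOrder()));
    }
}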

Example 23 with Row$

Use of org.apache.spark.sql.Row$ in project iceberg by apache.

The class TestRequiredDistributionAndOrdering, method testPartitionColumnsArePrependedForRangeDistribution:

@Test
public void testPartitionColumnsArePrependedForRangeDistribution() throws NoSuchTableException {
    sql("CREATE TABLE %s (c1 INT, c2 STRING, c3 STRING) " + "USING iceberg " + "PARTITIONED BY (bucket(2, c1))", tableName);
    List<ThreeColumnRecord> data = ImmutableList.of(
        new ThreeColumnRecord(1, null, "A"),
        new ThreeColumnRecord(2, "BBBBBBBBBB", "B"),
        new ThreeColumnRecord(3, "BBBBBBBBBB", "A"),
        new ThreeColumnRecord(4, "BBBBBBBBBB", "B"),
        new ThreeColumnRecord(5, "BBBBBBBBBB", "A"),
        new ThreeColumnRecord(6, "BBBBBBBBBB", "B"),
        new ThreeColumnRecord(7, "BBBBBBBBBB", "A"));
    Dataset<Row> ds = spark.createDataFrame(data, ThreeColumnRecord.class);
    Dataset<Row> inputDF = ds.coalesce(1).sortWithinPartitions("c1");
    // should automatically prepend partition columns to the ordering
    sql("ALTER TABLE %s WRITE ORDERED BY c1, c2", tableName);
    inputDF.writeTo(tableName).append();
    assertEquals("Row count must match", ImmutableList.of(row(7L)), sql("SELECT count(*) FROM %s", tableName));
}
Also used : Row (org.apache.spark.sql.Row), ThreeColumnRecord (org.apache.iceberg.spark.source.ThreeColumnRecord), Test (org.junit.Test)
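The prepending behavior above is tied to range-distributed writes. As a hedged aside, range distribution can also be requested explicitly through Iceberg's write.distribution-mode table property; the helper below is a sketch, assuming the caller supplies a SparkSession and a fully qualified table name.

import org.apache.spark.sql.SparkSession;

class DistributionModeSketch {
    // Explicitly asks Iceberg to range-distribute writes to the table.
    static void useRangeDistribution(SparkSession spark, String tableName) {
        spark.sql(String.format(
            "ALTER TABLE %s SET TBLPROPERTIES ('write.distribution-mode' = 'range')",
            tableName));
    }
}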

Example 24 with Row$

Use of org.apache.spark.sql.Row$ in project iceberg by apache.

The class TestRollbackToSnapshotProcedure, method testRollbackToSnapshotRefreshesRelationCache:

@Test
public void testRollbackToSnapshotRefreshesRelationCache() {
    sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
    sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
    Table table = validationCatalog.loadTable(tableIdent);
    Snapshot firstSnapshot = table.currentSnapshot();
    sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
    table.refresh();
    Snapshot secondSnapshot = table.currentSnapshot();
    Dataset<Row> query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1");
    query.createOrReplaceTempView("tmp");
    spark.sql("CACHE TABLE tmp");
    assertEquals("View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp"));
    List<Object[]> output = sql("CALL %s.system.rollback_to_snapshot(table => '%s', snapshot_id => %dL)", catalogName, tableIdent, firstSnapshot.snapshotId());
    assertEquals("Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output);
    assertEquals("View cache must be invalidated", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp"));
    sql("UNCACHE TABLE tmp");
}
Also used : Snapshot (org.apache.iceberg.Snapshot), Table (org.apache.iceberg.Table), Row (org.apache.spark.sql.Row), Test (org.junit.Test)
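As a hedged sketch outside the test harness (catalog and table names are placeholders), the same rollback_to_snapshot procedure can be driven from a plain SparkSession. Matching the row(...) assertion in the test above, the first column of the output Row is the previous snapshot id and the second is the current one.

import java.util.List;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

class RollbackToSnapshotSketch {
    // Rolls the table back and prints the snapshot ids reported by the procedure.
    // "my_catalog" and "db.tbl" are hypothetical placeholders.
    static void rollback(SparkSession spark, long snapshotId) {
        List<Row> output = spark.sql(String.format(
            "CALL my_catalog.system.rollback_to_snapshot(table => 'db.tbl', snapshot_id => %dL)",
            snapshotId)).collectAsList();
        Row result = output.get(0);
        System.out.println("previous_snapshot_id=" + result.getLong(0));
        System.out.println("current_snapshot_id=" + result.getLong(1));
    }
}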

Example 25 with Row$

Use of org.apache.spark.sql.Row$ in project iceberg by apache.

The class TestRollbackToTimestampProcedure, method testRollbackToTimestampRefreshesRelationCache:

@Test
public void testRollbackToTimestampRefreshesRelationCache() {
    sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
    sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
    Table table = validationCatalog.loadTable(tableIdent);
    Snapshot firstSnapshot = table.currentSnapshot();
    String firstSnapshotTimestamp = LocalDateTime.now().toString();
    waitUntilAfter(firstSnapshot.timestampMillis());
    sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
    table.refresh();
    Snapshot secondSnapshot = table.currentSnapshot();
    Dataset<Row> query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1");
    query.createOrReplaceTempView("tmp");
    spark.sql("CACHE TABLE tmp");
    assertEquals("View should have expected rows", ImmutableList.of(row(1L, "a"), row(1L, "a")), sql("SELECT * FROM tmp"));
    List<Object[]> output = sql("CALL %s.system.rollback_to_timestamp(table => '%s', timestamp => TIMESTAMP '%s')", catalogName, tableIdent, firstSnapshotTimestamp);
    assertEquals("Procedure output must match", ImmutableList.of(row(secondSnapshot.snapshotId(), firstSnapshot.snapshotId())), output);
    assertEquals("View cache must be invalidated", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp"));
    sql("UNCACHE TABLE tmp");
}
Also used : Snapshot (org.apache.iceberg.Snapshot), Table (org.apache.iceberg.Table), Row (org.apache.spark.sql.Row), Test (org.junit.Test)
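The CACHE TABLE / UNCACHE TABLE steps in these two tests can also be verified programmatically. A brief sketch, assuming the same SparkSession and the temp view name tmp used in the test:

import org.apache.spark.sql.SparkSession;

class CacheCheckSketch {
    // Reports whether the temp view "tmp" is currently cached, then drops the cache.
    static void inspectAndUncache(SparkSession spark) {
        boolean cached = spark.catalog().isCached("tmp");
        System.out.println("tmp cached: " + cached);
        spark.catalog().uncacheTable("tmp");
    }
}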

Aggregations

Row (org.apache.spark.sql.Row): 1045
Test (org.junit.Test): 344
ArrayList (java.util.ArrayList): 244
SparkSession (org.apache.spark.sql.SparkSession): 243
StructType (org.apache.spark.sql.types.StructType): 215
Test (org.junit.jupiter.api.Test): 157
StructField (org.apache.spark.sql.types.StructField): 138
Table (org.apache.iceberg.Table): 127
Dataset (org.apache.spark.sql.Dataset): 123
List (java.util.List): 115
Script (org.apache.sysml.api.mlcontext.Script): 104
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 101
IOException (java.io.IOException): 78
Column (org.apache.spark.sql.Column): 78
File (java.io.File): 76
Collectors (java.util.stream.Collectors): 73
PartitionSpec (org.apache.iceberg.PartitionSpec): 70
DatasetBuilder (au.csiro.pathling.test.builders.DatasetBuilder): 66
Map (java.util.Map): 66
HadoopTables (org.apache.iceberg.hadoop.HadoopTables): 61
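Since every example on this page centers on org.apache.spark.sql.Row, a short self-contained sketch of building and reading rows may help as a reference; the schema and values below are made up for illustration and are not taken from the Iceberg tests.

import java.util.Arrays;
import java.util.List;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class RowBasicsSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("row-basics-sketch")
            .master("local[*]")
            .getOrCreate();
        // A Row is an ordered, loosely typed record; the field names and types live in the StructType.
        StructType schema = new StructType()
            .add("id", DataTypes.IntegerType)
            .add("data", DataTypes.StringType);
        List<Row> rows = Arrays.asList(
            RowFactory.create(1, "a"),
            RowFactory.create(2, "b"));
        Dataset<Row> df = spark.createDataFrame(rows, schema);
        // Fields are read back by position or by name.
        for (Row row : df.collectAsList()) {
            System.out.println(row.getInt(0) + " -> " + row.<String>getAs("data"));
        }
        spark.stop();
    }
}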