
Example 16 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

The class TestCherrypickSnapshotProcedure, method testCherrypickSnapshotRefreshesRelationCache.

@Test
public void testCherrypickSnapshotRefreshesRelationCache() {
    sql("CREATE TABLE %s (id bigint NOT NULL, data string) USING iceberg", tableName);
    sql("ALTER TABLE %s SET TBLPROPERTIES ('%s' 'true')", tableName, WRITE_AUDIT_PUBLISH_ENABLED);
    Dataset<Row> query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1");
    query.createOrReplaceTempView("tmp");
    spark.sql("CACHE TABLE tmp");
    assertEquals("View should not produce rows", ImmutableList.of(), sql("SELECT * FROM tmp"));
    spark.conf().set("spark.wap.id", "1");
    sql("INSERT INTO TABLE %s VALUES (1, 'a')", tableName);
    assertEquals("Should not see rows from staged snapshot", ImmutableList.of(), sql("SELECT * FROM %s", tableName));
    Table table = validationCatalog.loadTable(tableIdent);
    Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots());
    sql("CALL %s.system.cherrypick_snapshot('%s', %dL)", catalogName, tableIdent, wapSnapshot.snapshotId());
    assertEquals("Cherrypick snapshot should be visible", ImmutableList.of(row(1L, "a")), sql("SELECT * FROM tmp"));
    sql("UNCACHE TABLE tmp");
}
Also used : Snapshot(org.apache.iceberg.Snapshot), Table(org.apache.iceberg.Table), Row(org.apache.spark.sql.Row), Test(org.junit.Test)
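
For comparison, the same cherry-pick can be done through Iceberg's Java API instead of the cherrypick_snapshot procedure. A minimal sketch, assuming table is an org.apache.iceberg.Table already loaded from a catalog (the class and method names here are illustrative, not from the test):

import org.apache.iceberg.Snapshot;
import org.apache.iceberg.Table;
import com.google.common.collect.Iterables;

class CherrypickSketch {
    // Publishes a staged write-audit-publish (WAP) snapshot by cherry-picking it.
    static void publishStagedSnapshot(Table table) {
        // As in the test above, assume the staged snapshot is the only one.
        Snapshot wapSnapshot = Iterables.getOnlyElement(table.snapshots());
        // cherrypick() applies the snapshot's changes on top of the current
        // table state; commit() makes the result the new current snapshot.
        table.manageSnapshots()
            .cherrypick(wapSnapshot.snapshotId())
            .commit();
    }
}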

Example 17 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

The class TestConflictValidation, method testOverwritePartitionSnapshotIsolation2.

@Test
public void testOverwritePartitionSnapshotIsolation2() throws Exception {
    Table table = validationCatalog.loadTable(tableIdent);
    final long snapshotId = table.currentSnapshot().snapshotId();
    // This should delete a data file
    sql("DELETE FROM %s WHERE id='1'", tableName);
    // Validating from previous snapshot finds conflicts
    List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"));
    spark.createDataFrame(records, SimpleRecord.class).coalesce(1).writeTo(tableName).append();
    Dataset<Row> conflictingDf = spark.createDataFrame(records, SimpleRecord.class);
    AssertHelpers.assertThrowsCause(
        "Conflicting deleted data files should throw exception",
        ValidationException.class,
        "Found conflicting deleted files that can apply to records matching [id=1]",
        () -> {
            try {
                conflictingDf.writeTo(tableName)
                    .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId))
                    .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString())
                    .overwritePartitions();
            } catch (NoSuchTableException e) {
                throw new RuntimeException(e);
            }
        });
    // Validating from latest snapshot should succeed
    table.refresh();
    long newSnapshotId = table.currentSnapshot().snapshotId();
    conflictingDf.writeTo(tableName)
        .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId))
        .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString())
        .overwritePartitions();
}
Also used : Table(org.apache.iceberg.Table), NoSuchTableException(org.apache.spark.sql.catalyst.analysis.NoSuchTableException), SimpleRecord(org.apache.iceberg.spark.source.SimpleRecord), Row(org.apache.spark.sql.Row), Test(org.junit.Test)
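
The validation knobs used here are plain string options on Spark's DataFrameWriterV2. A hypothetical helper (the ValidatedWrites class and its method are ours, not Iceberg's) that factors out the repeated chain in these tests:

import org.apache.iceberg.IsolationLevel;
import org.apache.iceberg.spark.SparkWriteOptions;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;

class ValidatedWrites {
    // Overwrites the partitions touched by df, validating against commits made
    // after fromSnapshotId under the given isolation level.
    static void overwriteWithValidation(Dataset<Row> df, String tableName,
            long fromSnapshotId, IsolationLevel level) throws NoSuchTableException {
        df.writeTo(tableName)
            // Only commits newer than this snapshot are checked for conflicts.
            .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(fromSnapshotId))
            // SNAPSHOT flags conflicting deletes; SERIALIZABLE also flags
            // concurrent appends to the overwritten partitions (see Example 18).
            .option(SparkWriteOptions.ISOLATION_LEVEL, level.toString())
            .overwritePartitions();
    }
}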

Example 18 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

The class TestConflictValidation, method testOverwritePartitionSerializableIsolation.

@Test
public void testOverwritePartitionSerializableIsolation() throws Exception {
    Table table = validationCatalog.loadTable(tableIdent);
    final long snapshotId = table.currentSnapshot().snapshotId();
    List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"));
    spark.createDataFrame(records, SimpleRecord.class).writeTo(tableName).append();
    // Validating from previous snapshot finds conflicts
    Dataset<Row> conflictingDf = spark.createDataFrame(records, SimpleRecord.class);
    AssertHelpers.assertThrowsCause(
        "Conflicting deleted data files should throw exception",
        ValidationException.class,
        "Found conflicting files that can contain records matching partitions [id=1]",
        () -> {
            try {
                conflictingDf.writeTo(tableName)
                    .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId))
                    .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString())
                    .overwritePartitions();
            } catch (NoSuchTableException e) {
                throw new RuntimeException(e);
            }
        });
    // Validating from latest snapshot should succeed
    table.refresh();
    long newSnapshotId = table.currentSnapshot().snapshotId();
    conflictingDf.writeTo(tableName)
        .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId))
        .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SERIALIZABLE.toString())
        .overwritePartitions();
}
Also used : Table(org.apache.iceberg.Table), NoSuchTableException(org.apache.spark.sql.catalyst.analysis.NoSuchTableException), SimpleRecord(org.apache.iceberg.spark.source.SimpleRecord), Row(org.apache.spark.sql.Row), Test(org.junit.Test)
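
With the hypothetical helper sketched after Example 17, the passing path at the end of this test reduces to:

    table.refresh();
    ValidatedWrites.overwriteWithValidation(
        conflictingDf, tableName, table.currentSnapshot().snapshotId(),
        IsolationLevel.SERIALIZABLE);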

Example 19 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

The class TestConflictValidation, method testOverwritePartitionSnapshotIsolation.

@Test
public void testOverwritePartitionSnapshotIsolation() throws Exception {
    List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "a"), new SimpleRecord(1, "b"));
    spark.createDataFrame(records, SimpleRecord.class).coalesce(1).writeTo(tableName).append();
    Table table = validationCatalog.loadTable(tableIdent);
    final long snapshotId = table.currentSnapshot().snapshotId();
    // This should generate a delete file
    sql("DELETE FROM %s WHERE data='a'", tableName);
    // Validating from previous snapshot finds conflicts
    Dataset<Row> conflictingDf = spark.createDataFrame(records, SimpleRecord.class);
    AssertHelpers.assertThrowsCause(
        "Conflicting deleted data files should throw exception",
        ValidationException.class,
        "Found new conflicting delete files that can apply to records matching [id=1]",
        () -> {
            try {
                conflictingDf.writeTo(tableName)
                    .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(snapshotId))
                    .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString())
                    .overwritePartitions();
            } catch (NoSuchTableException e) {
                throw new RuntimeException(e);
            }
        });
    // Validating from latest snapshot should succeed
    table.refresh();
    long newSnapshotId = table.currentSnapshot().snapshotId();
    conflictingDf.writeTo(tableName)
        .option(SparkWriteOptions.VALIDATE_FROM_SNAPSHOT_ID, String.valueOf(newSnapshotId))
        .option(SparkWriteOptions.ISOLATION_LEVEL, IsolationLevel.SNAPSHOT.toString())
        .overwritePartitions();
}
Also used : Table(org.apache.iceberg.Table), NoSuchTableException(org.apache.spark.sql.catalyst.analysis.NoSuchTableException), SimpleRecord(org.apache.iceberg.spark.source.SimpleRecord), Row(org.apache.spark.sql.Row), Test(org.junit.Test)

Example 20 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

The class TestUpdate, method testUpdateRefreshesRelationCache.

@Test
public void testUpdateRefreshesRelationCache() {
    createAndInitTable("id INT, dep STRING");
    sql("ALTER TABLE %s ADD PARTITION FIELD dep", tableName);
    append(tableName, "{ \"id\": 1, \"dep\": \"hr\" }\n" + "{ \"id\": 3, \"dep\": \"hr\" }");
    append(tableName, "{ \"id\": 1, \"dep\": \"hardware\" }\n" + "{ \"id\": 2, \"dep\": \"hardware\" }");
    Dataset<Row> query = spark.sql("SELECT * FROM " + tableName + " WHERE id = 1");
    query.createOrReplaceTempView("tmp");
    spark.sql("CACHE TABLE tmp");
    assertEquals("View should have correct data", ImmutableList.of(row(1, "hardware"), row(1, "hr")), sql("SELECT * FROM tmp ORDER BY id, dep"));
    sql("UPDATE %s SET id = -1 WHERE id = 1", tableName);
    Table table = validationCatalog.loadTable(tableIdent);
    Assert.assertEquals("Should have 3 snapshots", 3, Iterables.size(table.snapshots()));
    Snapshot currentSnapshot = table.currentSnapshot();
    if (mode(table) == COPY_ON_WRITE) {
        validateCopyOnWrite(currentSnapshot, "2", "2", "2");
    } else {
        validateMergeOnRead(currentSnapshot, "2", "2", "2");
    }
    assertEquals("Should have expected rows", ImmutableList.of(row(-1, "hardware"), row(-1, "hr"), row(2, "hardware"), row(3, "hr")), sql("SELECT * FROM %s ORDER BY id, dep", tableName));
    assertEquals("Should refresh the relation cache", ImmutableList.of(), sql("SELECT * FROM tmp ORDER BY id, dep"));
    spark.sql("UNCACHE TABLE tmp");
}
Also used : Snapshot(org.apache.iceberg.Snapshot), Table(org.apache.iceberg.Table), Row(org.apache.spark.sql.Row), Test(org.junit.Test)
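
A quick way to observe the caching behavior this test exercises is Spark's Catalog API. A minimal sketch, assuming an active SparkSession named spark and the tmp view created as in the test (the class and method names are illustrative):

import org.apache.spark.sql.SparkSession;

class RelationCacheSketch {
    static void inspectCache(SparkSession spark) {
        spark.sql("CACHE TABLE tmp");
        // Catalog.isCached reports whether the view's plan is in Spark's cache.
        System.out.println("tmp cached? " + spark.catalog().isCached("tmp"));
        // The test above shows that an Iceberg UPDATE refreshes cached plans that
        // refer to the table: after the commit, SELECT * FROM tmp returns the
        // post-update result (empty, since no rows match id = 1 anymore).
        spark.sql("UNCACHE TABLE tmp");
    }
}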

Aggregations

Row (org.apache.spark.sql.Row) 1045
Test (org.junit.Test) 344
ArrayList (java.util.ArrayList) 244
SparkSession (org.apache.spark.sql.SparkSession) 243
StructType (org.apache.spark.sql.types.StructType) 215
Test (org.junit.jupiter.api.Test) 157
StructField (org.apache.spark.sql.types.StructField) 138
Table (org.apache.iceberg.Table) 127
Dataset (org.apache.spark.sql.Dataset) 123
List (java.util.List) 115
Script (org.apache.sysml.api.mlcontext.Script) 104
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 101
IOException (java.io.IOException) 78
Column (org.apache.spark.sql.Column) 78
File (java.io.File) 76
Collectors (java.util.stream.Collectors) 73
PartitionSpec (org.apache.iceberg.PartitionSpec) 70
DatasetBuilder (au.csiro.pathling.test.builders.DatasetBuilder) 66
Map (java.util.Map) 66
HadoopTables (org.apache.iceberg.hadoop.HadoopTables) 61