
Example 21 with SparkTable

Use of org.apache.iceberg.spark.source.SparkTable in project iceberg by apache.

In class TestCreateActions, method removeColumnsAtEnd:

@Test
public void removeColumnsAtEnd() throws Exception {
    Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop"));
    Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog"));
    String source = sourceName("test_remove_column_migrated_table");
    String dest = source;
    String colName1 = "newCol1";
    String colName2 = "newCol2";
    File location = temp.newFolder();
    spark.range(10).selectExpr("cast(id as INT)", "CAST(id as INT) " + colName1, "CAST(id as INT) " + colName2).write().mode(SaveMode.Overwrite).saveAsTable(dest);
    List<Object[]> expected1 = sql("select id, %s from %s order by id", colName1, source);
    List<Object[]> expected2 = sql("select id from %s order by id", source);
    // migrate table
    SparkActions.get().migrateTable(source).execute();
    SparkTable sparkTable = loadTable(dest);
    Table table = sparkTable.table();
    // test column removal on migrated table
    Schema beforeSchema = table.schema();
    sparkTable.table().updateSchema().deleteColumn(colName1).commit();
    Schema afterSchema = table.schema();
    Assert.assertNotNull(beforeSchema.findField(colName1));
    Assert.assertNull(afterSchema.findField(colName1));
    // reads should succeed without any exceptions
    List<Object[]> results1 = sql("select * from %s order by id", dest);
    Assert.assertTrue(results1.size() > 0);
    assertEquals("Output must match", expected1, results1);
    sql("ALTER TABLE %s DROP COLUMN %s", dest, colName2);
    StructType schema = spark.table(dest).schema();
    Assert.assertFalse(Arrays.asList(schema.fieldNames()).contains(colName2));
    // reads should succeed without any exceptions
    List<Object[]> results2 = sql("select * from %s order by id", dest);
    Assert.assertTrue(results2.size() > 0);
    assertEquals("Output must match", expected2, results2);
}
Also used: CatalogTable(org.apache.spark.sql.catalyst.catalog.CatalogTable) SnapshotTable(org.apache.iceberg.actions.SnapshotTable) MigrateTable(org.apache.iceberg.actions.MigrateTable) Table(org.apache.iceberg.Table) SparkTable(org.apache.iceberg.spark.source.SparkTable) StructType(org.apache.spark.sql.types.StructType) Schema(org.apache.iceberg.Schema) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) File(java.io.File) Test(org.junit.Test)
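
The loadTable(...) helper used above is not shown in this snippet. Below is a minimal sketch of how a SparkTable can be resolved through the Spark 3 DataSource V2 catalog API; the TableCatalog parameter and the database/table names are illustrative assumptions, not part of the original test.

import org.apache.iceberg.spark.source.SparkTable;
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
import org.apache.spark.sql.connector.catalog.Identifier;
import org.apache.spark.sql.connector.catalog.TableCatalog;

// Sketch only: resolve an Iceberg-backed table as a SparkTable via an already-configured TableCatalog.
// Assumes 'catalog' is an Iceberg SparkSessionCatalog or SparkCatalog instance; names are illustrative.
static SparkTable loadTable(TableCatalog catalog, String database, String tableName) throws NoSuchTableException {
    // loadTable returns the generic DSv2 Table; for Iceberg-managed tables this is a SparkTable
    return (SparkTable) catalog.loadTable(Identifier.of(new String[] { database }, tableName));
}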

Example 22 with SparkTable

Use of org.apache.iceberg.spark.source.SparkTable in project iceberg by apache.

In class TestCreateActions, method schemaEvolutionTestWithSparkAPI:

@Test
public void schemaEvolutionTestWithSparkAPI() throws Exception {
    Assume.assumeTrue("Cannot migrate to a hadoop based catalog", !type.equals("hadoop"));
    Assume.assumeTrue("Can only migrate from Spark Session Catalog", catalog.name().equals("spark_catalog"));
    File location = temp.newFolder();
    String tblName = sourceName("schema_evolution_test");
    // Data generation and partition addition
    spark.range(0, 5).selectExpr("CAST(id as INT) as col0", "CAST(id AS FLOAT) col2", "CAST(id AS LONG) col3").write().mode(SaveMode.Append).parquet(location.toURI().toString());
    Dataset<Row> rowDataset = spark.range(6, 10).selectExpr("CAST(id as INT) as col0", "CAST(id AS STRING) col1", "CAST(id AS FLOAT) col2", "CAST(id AS LONG) col3");
    rowDataset.write().mode(SaveMode.Append).parquet(location.toURI().toString());
    spark.read().schema(rowDataset.schema()).parquet(location.toURI().toString()).write().saveAsTable(tblName);
    List<Object[]> expectedBeforeAddColumn = sql("SELECT * FROM %s ORDER BY col0", tblName);
    List<Object[]> expectedAfterAddColumn = sql("SELECT col0, null, col1, col2, col3 FROM %s ORDER BY col0", tblName);
    // Migrate table
    SparkActions.get().migrateTable(tblName).execute();
    // check that the migrated Iceberg table returns the same rows as the original non-Iceberg table
    List<Object[]> afterMigrateBeforeAddResults = sql("SELECT * FROM %s ORDER BY col0", tblName);
    assertEquals("Output must match", expectedBeforeAddColumn, afterMigrateBeforeAddResults);
    // Update schema and check output correctness
    SparkTable sparkTable = loadTable(tblName);
    sparkTable.table().updateSchema().addColumn("newCol", Types.IntegerType.get()).moveAfter("newCol", "col0").commit();
    List<Object[]> afterMigrateAfterAddResults = sql("SELECT * FROM %s ORDER BY col0", tblName);
    assertEquals("Output must match", expectedAfterAddColumn, afterMigrateAfterAddResults);
}
Also used: Row(org.apache.spark.sql.Row) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) File(java.io.File) SparkTable(org.apache.iceberg.spark.source.SparkTable) Test(org.junit.Test)
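
The new column is added as an optional field, so rows written before the schema change read back as null, which is why expectedAfterAddColumn selects a null literal. A commented sketch of the same update through the Iceberg Table API, assuming a SparkTable has already been loaded (method name is illustrative):

import org.apache.iceberg.Table;
import org.apache.iceberg.spark.source.SparkTable;
import org.apache.iceberg.types.Types;

// Sketch only: the same evolution expressed directly against the underlying Iceberg table.
static void addColumnAfter(SparkTable sparkTable) {
    Table table = sparkTable.table();
    table.updateSchema()
        .addColumn("newCol", Types.IntegerType.get())  // added columns are optional; existing rows read as null
        .moveAfter("newCol", "col0")                    // reposition the new column right after col0
        .commit();                                      // atomically commit the new schema to table metadata
}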

Example 23 with SparkTable

Use of org.apache.iceberg.spark.source.SparkTable in project OpenLineage by OpenLineage.

In class IcebergHandlerTest, method testGetVersionString:

@Test
public void testGetVersionString() throws NoSuchTableException {
    SparkCatalog sparkCatalog = mock(SparkCatalog.class);
    SparkTable sparkTable = mock(SparkTable.class, RETURNS_DEEP_STUBS);
    Identifier identifier = Identifier.of(new String[] { "database", "schema" }, "table");
    when(sparkCatalog.loadTable(identifier)).thenReturn(sparkTable);
    when(sparkTable.table().currentSnapshot().snapshotId()).thenReturn(1500100900L);
    Optional<String> version = icebergHandler.getDatasetVersion(sparkCatalog, identifier, Collections.emptyMap());
    assertTrue(version.isPresent());
    assertEquals("1500100900", version.get());
}
Also used: DatasetIdentifier(io.openlineage.spark.agent.util.DatasetIdentifier) Identifier(org.apache.spark.sql.connector.catalog.Identifier) SparkCatalog(org.apache.iceberg.spark.SparkCatalog) SparkTable(org.apache.iceberg.spark.source.SparkTable) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
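
The deep-stubbed mock stands in for a real table; with a live SparkTable the same version string can be derived from the current snapshot, guarding against tables that have no snapshots yet. A hedged sketch follows (the method name is illustrative, not OpenLineage's actual implementation):

import java.util.Optional;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.spark.source.SparkTable;

// Sketch only: derive a dataset version from a live SparkTable's current snapshot.
static Optional<String> datasetVersion(SparkTable sparkTable) {
    // currentSnapshot() is null for a table that has no committed snapshots yet
    Snapshot current = sparkTable.table().currentSnapshot();
    return Optional.ofNullable(current).map(s -> String.valueOf(s.snapshotId()));
}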

Aggregations

SparkTable (org.apache.iceberg.spark.source.SparkTable): 23
Test (org.junit.Test): 12
Identifier (org.apache.spark.sql.connector.catalog.Identifier): 8
File (java.io.File): 7
SparkCatalog (org.apache.iceberg.spark.SparkCatalog): 7
Map (java.util.Map): 6
Table (org.apache.iceberg.Table): 6
StreamSupport (java.util.stream.StreamSupport): 5
DeleteOrphanFiles (org.apache.iceberg.actions.DeleteOrphanFiles): 5
Maps (org.apache.iceberg.relocated.com.google.common.collect.Maps): 5
SparkSchemaUtil (org.apache.iceberg.spark.SparkSchemaUtil): 5
SparkSessionCatalog (org.apache.iceberg.spark.SparkSessionCatalog): 5
Transform (org.apache.spark.sql.connector.expressions.Transform): 5
After (org.junit.After): 5
Assert (org.junit.Assert): 5
Schema (org.apache.iceberg.Schema): 4
MigrateTable (org.apache.iceberg.actions.MigrateTable): 3
SnapshotTable (org.apache.iceberg.actions.SnapshotTable): 3
NoSuchTableException (org.apache.spark.sql.catalyst.analysis.NoSuchTableException): 3
CatalogTable (org.apache.spark.sql.catalyst.catalog.CatalogTable): 3