Search in sources:

Example 96 with Schema

use of org.apache.iceberg.Schema in project hive by apache.

In the class TestHiveIcebergSchemaEvolution, the method testSchemaEvolutionOnVectorizedReads.

// Tests the CHANGE COLUMN feature similarly to the test above, but with a more complex schema, aimed to verify
// that vectorized reads support the feature properly, also combining it with other schema changes, e.g. ADD COLUMN.
@Test
public void testSchemaEvolutionOnVectorizedReads() throws Exception {
    // Currently only ORC, but in the future this should run against each fileformat with vectorized read support.
    Assume.assumeTrue("Vectorized reads only.", isVectorized);
    // Six-column "orders" schema: id, the customer's name split in two columns, quantity, price and item name.
    Schema orderSchema = new Schema(optional(1, "order_id", Types.IntegerType.get()), optional(2, "customer_first_name", Types.StringType.get()), optional(3, "customer_last_name", Types.StringType.get()), optional(4, "quantity", Types.IntegerType.get()), optional(5, "price", Types.IntegerType.get()), optional(6, "item", Types.StringType.get()));
    List<Record> records = TestHelper.RecordsBuilder.newInstance(orderSchema).add(1, "Doctor", "Strange", 100, 3, "apple").add(2, "Tony", "Stark", 150, 2, "apple").add(3, "Tony", "Stark", 200, 6, "orange").add(4, "Steve", "Rogers", 100, 8, "banana").add(5, "Doctor", "Strange", 800, 7, "orange").add(6, "Thor", "Odinson", 650, 3, "apple").build();
    testTables.createTable(shell, "orders", orderSchema, fileFormat, records);
    // Reorder columns and rename one column
    shell.executeStatement("ALTER TABLE orders CHANGE COLUMN " + "customer_first_name customer_first_name string AFTER customer_last_name");
    shell.executeStatement("ALTER TABLE orders CHANGE COLUMN " + "quantity quantity int AFTER price");
    shell.executeStatement("ALTER TABLE orders CHANGE COLUMN " + "item fruit string");
    // Aggregate with a filter on a reordered column to make sure the vectorized readers resolve
    // the columns after the reorder/rename.
    List<Object[]> result = shell.executeStatement("SELECT customer_first_name, customer_last_name, SUM(quantity) " + "FROM orders where price >= 3 group by customer_first_name, customer_last_name order by customer_first_name");
    // Expected: 4 rows flattened as (first_name, last_name, sum) triples.
    assertQueryResult(result, 4, "Doctor", "Strange", 900L, "Steve", "Rogers", 100L, "Thor", "Odinson", 650L, "Tony", "Stark", 200L);
    // Adding a new column (will end up as last col of the schema)
    shell.executeStatement("ALTER TABLE orders ADD COLUMNS (nickname string)");
    shell.executeStatement("INSERT INTO orders VALUES (7, 'Romanoff', 'Natasha', 3, 250, 'apple', 'Black Widow')");
    result = shell.executeStatement("SELECT customer_first_name, customer_last_name, nickname, SUM(quantity) " + " FROM orders where price >= 3 group by customer_first_name, customer_last_name, nickname " + " order by customer_first_name");
    // Pre-existing rows must read the newly added column as NULL.
    assertQueryResult(result, 5, "Doctor", "Strange", null, 900L, "Natasha", "Romanoff", "Black Widow", 250L, "Steve", "Rogers", null, 100L, "Thor", "Odinson", null, 650L, "Tony", "Stark", null, 200L);
    // Re-order newly added column (move it away from being the last column)
    shell.executeStatement("ALTER TABLE orders CHANGE COLUMN fruit fruit string AFTER nickname");
    result = shell.executeStatement("SELECT customer_first_name, customer_last_name, nickname, fruit, SUM(quantity) " + " FROM orders where price >= 3 and fruit < 'o' group by customer_first_name, customer_last_name, nickname, " + "fruit order by customer_first_name");
    assertQueryResult(result, 4, "Doctor", "Strange", null, "apple", 100L, "Natasha", "Romanoff", "Black Widow", "apple", 250L, "Steve", "Rogers", null, "banana", 100L, "Thor", "Odinson", null, "apple", 650L);
    // Rename newly added column (+ reading with different file includes)
    shell.executeStatement("ALTER TABLE orders CHANGE COLUMN nickname nick string");
    result = shell.executeStatement("SELECT customer_first_name, nick, SUM(quantity) " + " FROM orders where fruit < 'o'and nick IS NOT NULL group by customer_first_name, nick");
    assertQueryResult(result, 1, "Natasha", "Black Widow", 250L);
    // Re-order between different types
    shell.executeStatement("ALTER TABLE orders CHANGE COLUMN order_id order_id int AFTER customer_first_name");
    result = shell.executeStatement("SELECT customer_first_name, nick, SUM(quantity), MIN(order_id) " + " FROM orders where fruit < 'o'and nick IS NOT NULL group by customer_first_name, nick");
    assertQueryResult(result, 1, "Natasha", "Black Widow", 250L, 7);
    // Drop columns via REPLACE COLUMNS
    shell.executeStatement("ALTER TABLE orders REPLACE COLUMNS (" + "customer_last_name string COMMENT 'from deserializer', order_id int COMMENT 'from deserializer'," + " quantity int, nick string COMMENT 'from deserializer'," + " fruit string COMMENT 'from deserializer')");
    result = shell.executeStatement("DESCRIBE orders");
    // DESCRIBE rows are flattened as (name, type, comment) triples for the 5 surviving columns.
    assertQueryResult(result, 5, "customer_last_name", "string", "from deserializer", "order_id", "int", "from deserializer", "quantity", "int", "from deserializer", "nick", "string", "from deserializer", "fruit", "string", "from deserializer");
    result = shell.executeStatement("SELECT * FROM orders ORDER BY order_id");
    // Full scan after the REPLACE COLUMNS: dropped columns are gone, remaining ones keep their data.
    assertQueryResult(result, 7, "Strange", 1, 100, null, "apple", "Stark", 2, 150, null, "apple", "Stark", 3, 200, null, "orange", "Rogers", 4, 100, null, "banana", "Strange", 5, 800, null, "orange", "Odinson", 6, 650, null, "apple", "Romanoff", 7, 250, "Black Widow", "apple");
}
Also used : Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)

Example 97 with Schema

use of org.apache.iceberg.Schema in project hive by apache.

In the class TestHiveIcebergSchemaEvolution, the method testMoveCustomerIdAfterFirstNameInIcebergTable.

@Test
public void testMoveCustomerIdAfterFirstNameInIcebergTable() throws IOException {
    // Create an Iceberg table with the columns customer_id, first_name and last_name with some initial data.
    Table icebergTable = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
    // Move the customer_id column after the first_name column in the table schema,
    // resulting in the column order: first_name, customer_id, last_name.
    icebergTable.updateSchema().moveAfter("customer_id", "first_name").commit();
    Schema customerSchemaLastNameFirst = new Schema(optional(1, "first_name", Types.StringType.get(), "This is first name"), optional(2, "customer_id", Types.LongType.get()), optional(3, "last_name", Types.StringType.get(), "This is last name"));
    TestHelper.RecordsBuilder customersWithLastNameFirstBuilder = TestHelper.RecordsBuilder.newInstance(customerSchemaLastNameFirst).add("Alice", 0L, "Brown").add("Bob", 1L, "Green").add("Trudy", 2L, "Pink");
    List<Record> customersWithLastNameFirst = customersWithLastNameFirstBuilder.build();
    // Run a 'select *' to check if the order of the column in the result has been changed.
    List<Object[]> rows = shell.executeStatement("SELECT * FROM default.customers");
    HiveIcebergTestUtils.validateData(customersWithLastNameFirst, HiveIcebergTestUtils.valueForRow(customerSchemaLastNameFirst, rows), 1);
    // Query the data with names and check if the result is the same as when the table was created.
    rows = shell.executeStatement("SELECT customer_id, first_name, last_name FROM default.customers");
    HiveIcebergTestUtils.validateData(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, rows), 0);
    // Insert data from Hive to check that the VALUES list has to follow the new column order
    // (first_name, customer_id, last_name).
    shell.executeStatement("INSERT INTO default.customers values ('Lily', 3L, 'Magenta')");
    customersWithLastNameFirstBuilder.add("Lily", 3L, "Magenta");
    customersWithLastNameFirst = customersWithLastNameFirstBuilder.build();
    rows = shell.executeStatement("SELECT * FROM default.customers");
    HiveIcebergTestUtils.validateData(customersWithLastNameFirst, HiveIcebergTestUtils.valueForRow(customerSchemaLastNameFirst, rows), 1);
}
Also used : TestHelper(org.apache.iceberg.mr.TestHelper) Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)

Example 98 with Schema

use of org.apache.iceberg.Schema in project hive by apache.

In the class TestHiveIcebergSchemaEvolution, the method testUpdateColumnTypeInIcebergTable.

@Test
public void testUpdateColumnTypeInIcebergTable() throws IOException, TException, InterruptedException {
    // Create an Iceberg table with int, float and decimal(2,1) types with some initial records
    Schema schema = new Schema(optional(1, "id", Types.LongType.get()), optional(2, "int_col", Types.IntegerType.get(), "This is an integer type"), optional(3, "float_col", Types.FloatType.get(), "This is a float type"), optional(4, "decimal_col", Types.DecimalType.of(2, 1), "This is a decimal type"));
    List<Record> records = TestHelper.RecordsBuilder.newInstance(schema).add(0L, 35, 22F, BigDecimal.valueOf(13L, 1)).add(1L, 223344, 555.22F, BigDecimal.valueOf(22L, 1)).add(2L, -234, -342F, BigDecimal.valueOf(-12L, 1)).build();
    Table icebergTable = testTables.createTable(shell, "types_table", schema, fileFormat, records);
    // In the result set a float column is returned as double and a decimal is returned as string,
    // even though Hive has the columns with the right types.
    // Probably this conversion happens when fetching the result set after calling the select through the shell.
    // Because of this, a separate schema and record list has to be used when validating the returned values.
    Schema schemaForResultSet = new Schema(optional(1, "id", Types.LongType.get()), optional(2, "int_col", Types.IntegerType.get()), optional(3, "float_col", Types.DoubleType.get()), optional(4, "decimal_col", Types.StringType.get()));
    // Build the expected records against schemaForResultSet (not schema): the values below are the
    // double/String representations coming back from the shell, not the table's float/decimal types.
    List<Record> expectedResults = TestHelper.RecordsBuilder.newInstance(schemaForResultSet).add(0L, 35, 22d, "1.3").add(1L, 223344, 555.22d, "2.2").add(2L, -234, -342d, "-1.2").build();
    // Check the select result and the column types of the table.
    List<Object[]> rows = shell.executeStatement("SELECT * FROM types_table");
    HiveIcebergTestUtils.validateData(expectedResults, HiveIcebergTestUtils.valueForRow(schemaForResultSet, rows), 0);
    org.apache.hadoop.hive.metastore.api.Table table = shell.metastore().getTable("default", "types_table");
    Assert.assertNotNull(table);
    Assert.assertNotNull(table.getSd());
    List<FieldSchema> columns = table.getSd().getCols();
    Assert.assertEquals("id", columns.get(0).getName());
    Assert.assertEquals("bigint", columns.get(0).getType());
    Assert.assertEquals("int_col", columns.get(1).getName());
    Assert.assertEquals("int", columns.get(1).getType());
    Assert.assertEquals("float_col", columns.get(2).getName());
    Assert.assertEquals("float", columns.get(2).getType());
    Assert.assertEquals("decimal_col", columns.get(3).getName());
    Assert.assertEquals("decimal(2,1)", columns.get(3).getType());
    // Change the column types on the table to long, double and decimal(6,1)
    icebergTable.updateSchema().updateColumn("int_col", Types.LongType.get()).updateColumn("float_col", Types.DoubleType.get()).updateColumn("decimal_col", Types.DecimalType.of(6, 1)).commit();
    schemaForResultSet = new Schema(optional(1, "id", Types.LongType.get()), optional(2, "int_col", Types.LongType.get()), optional(3, "float_col", Types.DoubleType.get()), optional(4, "decimal_col", Types.StringType.get()));
    // 555.22F widened to double keeps the float's exact binary value, hence 555.219970703125d.
    expectedResults = TestHelper.RecordsBuilder.newInstance(schemaForResultSet).add(0L, 35L, 22d, "1.3").add(1L, 223344L, 555.219970703125d, "2.2").add(2L, -234L, -342d, "-1.2").build();
    rows = shell.executeStatement("SELECT * FROM types_table");
    HiveIcebergTestUtils.validateData(expectedResults, HiveIcebergTestUtils.valueForRow(schemaForResultSet, rows), 0);
    // NOTE(review): for non-HIVE_CATALOG table types the HMS schema is presumably not synced after
    // the Iceberg schema update — TODO confirm —
    // so no point in checking the column types.
    if (TestTables.TestTableType.HIVE_CATALOG.equals(this.testTableType)) {
        table = shell.metastore().getTable("default", "types_table");
        Assert.assertNotNull(table);
        Assert.assertNotNull(table.getSd());
        columns = table.getSd().getCols();
        Assert.assertEquals("id", columns.get(0).getName());
        Assert.assertEquals("bigint", columns.get(0).getType());
        Assert.assertEquals("int_col", columns.get(1).getName());
        Assert.assertEquals("bigint", columns.get(1).getType());
        Assert.assertEquals("float_col", columns.get(2).getName());
        Assert.assertEquals("double", columns.get(2).getType());
        Assert.assertEquals("decimal_col", columns.get(3).getName());
        Assert.assertEquals("decimal(6,1)", columns.get(3).getType());
    }
    // Insert some data which fit to the new column types and check if they are saved and can be queried correctly.
    // This should work for all catalog types.
    shell.executeStatement("INSERT INTO types_table values (3, 3147483647, 111.333, 12345.5), (4, -3147483648, 55, -2234.5)");
    expectedResults = TestHelper.RecordsBuilder.newInstance(schemaForResultSet).add(3L, 3147483647L, 111.333d, "12345.5").add(4L, -3147483648L, 55d, "-2234.5").build();
    rows = shell.executeStatement("SELECT * FROM types_table where id in(3, 4)");
    HiveIcebergTestUtils.validateData(expectedResults, HiveIcebergTestUtils.valueForRow(schemaForResultSet, rows), 0);
}
Also used : Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)

Example 99 with Schema

use of org.apache.iceberg.Schema in project hive by apache.

In the class TestHiveIcebergSchemaEvolution, the method testMoveLastNameToFirstInIcebergTable.

@Test
public void testMoveLastNameToFirstInIcebergTable() throws IOException {
    // Start from the standard customers table (customer_id, first_name, last_name) with the default records.
    Table table = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
    // Promote last_name to be the leading column of the Iceberg schema.
    table.updateSchema().moveFirst("last_name").commit();
    // Schema mirroring the new column order: last_name, customer_id, first_name.
    Schema reorderedSchema = new Schema(optional(1, "last_name", Types.StringType.get(), "This is last name"), optional(2, "customer_id", Types.LongType.get()), optional(3, "first_name", Types.StringType.get(), "This is first name"));
    TestHelper.RecordsBuilder expectedBuilder = TestHelper.RecordsBuilder.newInstance(reorderedSchema).add("Brown", 0L, "Alice").add("Green", 1L, "Bob").add("Pink", 2L, "Trudy");
    List<Record> expected = expectedBuilder.build();
    // A 'select *' must now emit the columns in the reordered sequence.
    List<Object[]> queryResult = shell.executeStatement("SELECT * FROM default.customers");
    HiveIcebergTestUtils.validateData(expected, HiveIcebergTestUtils.valueForRow(reorderedSchema, queryResult), 1);
    // Selecting by explicit column names still yields the original data in the requested order.
    queryResult = shell.executeStatement("SELECT customer_id, first_name, last_name FROM default.customers");
    HiveIcebergTestUtils.validateData(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, queryResult), 0);
    // An INSERT issued from Hive has to list the last_name value first now.
    shell.executeStatement("INSERT INTO default.customers values ('Magenta', 3L, 'Lily')");
    expectedBuilder.add("Magenta", 3L, "Lily");
    expected = expectedBuilder.build();
    queryResult = shell.executeStatement("SELECT * FROM default.customers");
    HiveIcebergTestUtils.validateData(expected, HiveIcebergTestUtils.valueForRow(reorderedSchema, queryResult), 1);
}
Also used : TestHelper(org.apache.iceberg.mr.TestHelper) Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)

Example 100 with Schema

use of org.apache.iceberg.Schema in project hive by apache.

In the class TestHiveIcebergSchemaEvolution, the method testRenameColumnInIcebergTable.

@Test
public void testRenameColumnInIcebergTable() throws IOException {
    // Create an Iceberg table with the columns customer_id, first_name and last_name with some initial data.
    Table icebergTable = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
    // Rename the last_name column to family_name
    icebergTable.updateSchema().renameColumn("last_name", "family_name").commit();
    if (testTableType != TestTables.TestTableType.HIVE_CATALOG) {
        // We need to update columns for non-Hive catalogs
        shell.executeStatement("ALTER TABLE customers UPDATE COLUMNS");
    }
    Schema schemaWithFamilyName = new Schema(optional(1, "customer_id", Types.LongType.get()), optional(2, "first_name", Types.StringType.get(), "This is first name"), optional(3, "family_name", Types.StringType.get(), "This is last name"));
    // Run a 'select *' from Hive to check if the same records are returned in the same order as before the rename.
    List<Object[]> rows = shell.executeStatement("SELECT * FROM default.customers");
    HiveIcebergTestUtils.validateData(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, HiveIcebergTestUtils.valueForRow(schemaWithFamilyName, rows), 0);
    // Two-column projection schema for the customer_id + family_name queries.
    // (Renamed local: was misspelled 'shemaWithFamilyNameOnly'.)
    Schema schemaWithFamilyNameOnly = new Schema(optional(1, "customer_id", Types.LongType.get()), optional(2, "family_name", Types.StringType.get(), "This is family name"));
    TestHelper.RecordsBuilder customersWithFamilyNameOnlyBuilder = TestHelper.RecordsBuilder.newInstance(schemaWithFamilyNameOnly).add(0L, "Brown").add(1L, "Green").add(2L, "Pink");
    List<Record> customersWithFamilyNameOnly = customersWithFamilyNameOnlyBuilder.build();
    // Run a 'select family_name' from Hive to check if the column can be queried with the new name.
    rows = shell.executeStatement("SELECT customer_id, family_name FROM default.customers");
    HiveIcebergTestUtils.validateData(customersWithFamilyNameOnly, HiveIcebergTestUtils.valueForRow(schemaWithFamilyNameOnly, rows), 0);
    // Run a 'select last_name' to check if an exception is thrown.
    AssertHelpers.assertThrows("should throw exception", IllegalArgumentException.class, "Invalid table alias or column reference 'last_name'", () -> {
        shell.executeStatement("SELECT last_name FROM default.customers");
    });
    // Insert some data from Hive to check if the renamed column can still be filled.
    shell.executeStatement("INSERT INTO default.customers values (3L, 'Lily', 'Magenta'), (4L, 'Roni', NULL)");
    List<Record> newCustomers = TestHelper.RecordsBuilder.newInstance(schemaWithFamilyName).add(0L, "Alice", "Brown").add(1L, "Bob", "Green").add(2L, "Trudy", "Pink").add(3L, "Lily", "Magenta").add(4L, "Roni", null).build();
    rows = shell.executeStatement("SELECT * FROM default.customers");
    HiveIcebergTestUtils.validateData(newCustomers, HiveIcebergTestUtils.valueForRow(schemaWithFamilyName, rows), 0);
    customersWithFamilyNameOnlyBuilder.add(3L, "Magenta").add(4L, null);
    customersWithFamilyNameOnly = customersWithFamilyNameOnlyBuilder.build();
    rows = shell.executeStatement("SELECT customer_id, family_name FROM default.customers");
    HiveIcebergTestUtils.validateData(customersWithFamilyNameOnly, HiveIcebergTestUtils.valueForRow(schemaWithFamilyNameOnly, rows), 0);
}
Also used : TestHelper(org.apache.iceberg.mr.TestHelper) Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)

Aggregations

Schema (org.apache.iceberg.Schema)126 Test (org.junit.Test)93 Record (org.apache.iceberg.data.Record)68 Table (org.apache.iceberg.Table)55 PartitionSpec (org.apache.iceberg.PartitionSpec)39 GenericRecord (org.apache.iceberg.data.GenericRecord)36 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)30 List (java.util.List)21 TableIdentifier (org.apache.iceberg.catalog.TableIdentifier)20 IOException (java.io.IOException)16 Types (org.apache.iceberg.types.Types)16 ArrayList (java.util.ArrayList)15 Map (java.util.Map)14 HashMap (java.util.HashMap)13 FileFormat (org.apache.iceberg.FileFormat)13 UpdateSchema (org.apache.iceberg.UpdateSchema)12 Path (org.apache.hadoop.fs.Path)11 Collectors (java.util.stream.Collectors)10 ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList)10 TestHelper (org.apache.iceberg.mr.TestHelper)9