Example usage of org.apache.iceberg.data.Record in project hive by apache:
class TestHiveIcebergSchemaEvolution, method testMoveCustomerIdAfterFirstNameInIcebergTable.
@Test
public void testMoveCustomerIdAfterFirstNameInIcebergTable() throws IOException {
  // Start from the standard customers table (customer_id, first_name, last_name) with the default records.
  Table icebergTable = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
      fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);

  // Reorder the schema: move customer_id so that it comes right after first_name.
  icebergTable.updateSchema().moveAfter("customer_id", "first_name").commit();

  // The expected column order after the move is first_name, customer_id, last_name.
  Schema reorderedSchema = new Schema(
      optional(1, "first_name", Types.StringType.get(), "This is first name"),
      optional(2, "customer_id", Types.LongType.get()),
      optional(3, "last_name", Types.StringType.get(), "This is last name"));
  TestHelper.RecordsBuilder reorderedRecordsBuilder = TestHelper.RecordsBuilder.newInstance(reorderedSchema)
      .add("Alice", 0L, "Brown")
      .add("Bob", 1L, "Green")
      .add("Trudy", 2L, "Pink");
  List<Record> reorderedRecords = reorderedRecordsBuilder.build();

  // A 'select *' should now return the columns in the new order.
  List<Object[]> rows = shell.executeStatement("SELECT * FROM default.customers");
  HiveIcebergTestUtils.validateData(reorderedRecords, HiveIcebergTestUtils.valueForRow(reorderedSchema, rows), 1);

  // Selecting columns by name must still return the same data as before the reorder.
  rows = shell.executeStatement("SELECT customer_id, first_name, last_name FROM default.customers");
  HiveIcebergTestUtils.validateData(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS,
      HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, rows), 0);

  // Inserts from Hive must list the values in the new column order (first_name, customer_id, last_name).
  shell.executeStatement("INSERT INTO default.customers values ('Lily', 3L, 'Magenta')");
  reorderedRecordsBuilder.add("Lily", 3L, "Magenta");
  reorderedRecords = reorderedRecordsBuilder.build();
  rows = shell.executeStatement("SELECT * FROM default.customers");
  HiveIcebergTestUtils.validateData(reorderedRecords, HiveIcebergTestUtils.valueForRow(reorderedSchema, rows), 1);
}
Example usage of org.apache.iceberg.data.Record in project hive by apache:
class TestHiveIcebergSchemaEvolution, method testUpdateColumnTypeInIcebergTable.
@Test
public void testUpdateColumnTypeInIcebergTable() throws IOException, TException, InterruptedException {
  // Create an Iceberg table with int, float and decimal(2,1) types with some initial records.
  Schema schema = new Schema(
      optional(1, "id", Types.LongType.get()),
      optional(2, "int_col", Types.IntegerType.get(), "This is an integer type"),
      optional(3, "float_col", Types.FloatType.get(), "This is a float type"),
      optional(4, "decimal_col", Types.DecimalType.of(2, 1), "This is a decimal type"));
  List<Record> records = TestHelper.RecordsBuilder.newInstance(schema)
      .add(0L, 35, 22F, BigDecimal.valueOf(13L, 1))
      .add(1L, 223344, 555.22F, BigDecimal.valueOf(22L, 1))
      .add(2L, -234, -342F, BigDecimal.valueOf(-12L, 1))
      .build();
  Table icebergTable = testTables.createTable(shell, "types_table", schema, fileFormat, records);

  // In the result set a float column is returned as double and a decimal is returned as string,
  // even though Hive has the columns with the right types.
  // Probably this conversion happens when fetching the result set after calling the select through the shell.
  // Because of this, a separate schema and record list has to be used when validating the returned values.
  Schema schemaForResultSet = new Schema(
      optional(1, "id", Types.LongType.get()),
      optional(2, "int_col", Types.IntegerType.get()),
      optional(3, "float_col", Types.DoubleType.get()),
      optional(4, "decimal_col", Types.StringType.get()));
  // Build the expected records against the result-set schema so the declared field types match the
  // double/String values actually asserted (previously these were built against 'schema').
  List<Record> expectedResults = TestHelper.RecordsBuilder.newInstance(schemaForResultSet)
      .add(0L, 35, 22d, "1.3")
      .add(1L, 223344, 555.22d, "2.2")
      .add(2L, -234, -342d, "-1.2")
      .build();

  // Check the select result and the column types of the table.
  List<Object[]> rows = shell.executeStatement("SELECT * FROM types_table");
  HiveIcebergTestUtils.validateData(expectedResults, HiveIcebergTestUtils.valueForRow(schemaForResultSet, rows), 0);
  org.apache.hadoop.hive.metastore.api.Table table = shell.metastore().getTable("default", "types_table");
  Assert.assertNotNull(table);
  Assert.assertNotNull(table.getSd());
  List<FieldSchema> columns = table.getSd().getCols();
  Assert.assertEquals("id", columns.get(0).getName());
  Assert.assertEquals("bigint", columns.get(0).getType());
  Assert.assertEquals("int_col", columns.get(1).getName());
  Assert.assertEquals("int", columns.get(1).getType());
  Assert.assertEquals("float_col", columns.get(2).getName());
  Assert.assertEquals("float", columns.get(2).getType());
  Assert.assertEquals("decimal_col", columns.get(3).getName());
  Assert.assertEquals("decimal(2,1)", columns.get(3).getType());

  // Change the column types on the table to long, double and decimal(6,1).
  icebergTable.updateSchema()
      .updateColumn("int_col", Types.LongType.get())
      .updateColumn("float_col", Types.DoubleType.get())
      .updateColumn("decimal_col", Types.DecimalType.of(6, 1))
      .commit();
  schemaForResultSet = new Schema(
      optional(1, "id", Types.LongType.get()),
      optional(2, "int_col", Types.LongType.get()),
      optional(3, "float_col", Types.DoubleType.get()),
      optional(4, "decimal_col", Types.StringType.get()));
  // 555.22F widened to double is 555.219970703125 — the float's exact binary value.
  expectedResults = TestHelper.RecordsBuilder.newInstance(schemaForResultSet)
      .add(0L, 35L, 22d, "1.3")
      .add(1L, 223344L, 555.219970703125d, "2.2")
      .add(2L, -234L, -342d, "-1.2")
      .build();
  rows = shell.executeStatement("SELECT * FROM types_table");
  HiveIcebergTestUtils.validateData(expectedResults, HiveIcebergTestUtils.valueForRow(schemaForResultSet, rows), 0);

  // Only the Hive catalog is expected to keep the HMS column types in sync with the Iceberg schema,
  // so no point in checking the column types for the other catalog types.
  if (TestTables.TestTableType.HIVE_CATALOG.equals(testTableType)) {
    table = shell.metastore().getTable("default", "types_table");
    Assert.assertNotNull(table);
    Assert.assertNotNull(table.getSd());
    columns = table.getSd().getCols();
    Assert.assertEquals("id", columns.get(0).getName());
    Assert.assertEquals("bigint", columns.get(0).getType());
    Assert.assertEquals("int_col", columns.get(1).getName());
    Assert.assertEquals("bigint", columns.get(1).getType());
    Assert.assertEquals("float_col", columns.get(2).getName());
    Assert.assertEquals("double", columns.get(2).getType());
    Assert.assertEquals("decimal_col", columns.get(3).getName());
    Assert.assertEquals("decimal(6,1)", columns.get(3).getType());
  }

  // Insert some data which fit to the new column types and check if they are saved and can be queried correctly.
  // This should work for all catalog types.
  shell.executeStatement("INSERT INTO types_table values (3, 3147483647, 111.333, 12345.5), (4, -3147483648, 55, -2234.5)");
  expectedResults = TestHelper.RecordsBuilder.newInstance(schemaForResultSet)
      .add(3L, 3147483647L, 111.333d, "12345.5")
      .add(4L, -3147483648L, 55d, "-2234.5")
      .build();
  rows = shell.executeStatement("SELECT * FROM types_table where id in(3, 4)");
  HiveIcebergTestUtils.validateData(expectedResults, HiveIcebergTestUtils.valueForRow(schemaForResultSet, rows), 0);
}
Example usage of org.apache.iceberg.data.Record in project hive by apache:
class TestHiveIcebergSchemaEvolution, method testMoveLastNameToFirstInIcebergTable.
@Test
public void testMoveLastNameToFirstInIcebergTable() throws IOException {
  // Start from the standard customers table (customer_id, first_name, last_name) with the default records.
  Table icebergTable = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
      fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);

  // Reorder the schema: last_name becomes the first column.
  icebergTable.updateSchema().moveFirst("last_name").commit();

  // The expected column order after the move is last_name, customer_id, first_name.
  Schema lastNameFirstSchema = new Schema(
      optional(1, "last_name", Types.StringType.get(), "This is last name"),
      optional(2, "customer_id", Types.LongType.get()),
      optional(3, "first_name", Types.StringType.get(), "This is first name"));
  TestHelper.RecordsBuilder lastNameFirstBuilder = TestHelper.RecordsBuilder.newInstance(lastNameFirstSchema)
      .add("Brown", 0L, "Alice")
      .add("Green", 1L, "Bob")
      .add("Pink", 2L, "Trudy");
  List<Record> lastNameFirstRecords = lastNameFirstBuilder.build();

  // A 'select *' should now return the columns in the new order.
  List<Object[]> rows = shell.executeStatement("SELECT * FROM default.customers");
  HiveIcebergTestUtils.validateData(lastNameFirstRecords, HiveIcebergTestUtils.valueForRow(lastNameFirstSchema, rows), 1);

  // Selecting columns by name must still return the same data as before the reorder.
  rows = shell.executeStatement("SELECT customer_id, first_name, last_name FROM default.customers");
  HiveIcebergTestUtils.validateData(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS,
      HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, rows), 0);

  // Inserts from Hive must list the last_name value first in the VALUES list.
  shell.executeStatement("INSERT INTO default.customers values ('Magenta', 3L, 'Lily')");
  lastNameFirstBuilder.add("Magenta", 3L, "Lily");
  lastNameFirstRecords = lastNameFirstBuilder.build();
  rows = shell.executeStatement("SELECT * FROM default.customers");
  HiveIcebergTestUtils.validateData(lastNameFirstRecords, HiveIcebergTestUtils.valueForRow(lastNameFirstSchema, rows), 1);
}
Example usage of org.apache.iceberg.data.Record in project hive by apache:
class TestHiveIcebergSchemaEvolution, method testRenameColumnInIcebergTable.
@Test
public void testRenameColumnInIcebergTable() throws IOException {
  // Create an Iceberg table with the columns customer_id, first_name and last_name with some initial data.
  Table icebergTable = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
      fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);

  // Rename the last_name column to family_name.
  icebergTable.updateSchema().renameColumn("last_name", "family_name").commit();
  if (testTableType != TestTables.TestTableType.HIVE_CATALOG) {
    // We need to update columns for non-Hive catalogs
    shell.executeStatement("ALTER TABLE customers UPDATE COLUMNS");
  }
  Schema schemaWithFamilyName = new Schema(
      optional(1, "customer_id", Types.LongType.get()),
      optional(2, "first_name", Types.StringType.get(), "This is first name"),
      optional(3, "family_name", Types.StringType.get(), "This is last name"));

  // Run a 'select *' from Hive to check if the same records are returned in the same order as before the rename.
  List<Object[]> rows = shell.executeStatement("SELECT * FROM default.customers");
  HiveIcebergTestUtils.validateData(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS,
      HiveIcebergTestUtils.valueForRow(schemaWithFamilyName, rows), 0);

  // Fixed local name: was misspelled 'shemaWithFamilyNameOnly'.
  Schema schemaWithFamilyNameOnly = new Schema(
      optional(1, "customer_id", Types.LongType.get()),
      optional(2, "family_name", Types.StringType.get(), "This is family name"));
  TestHelper.RecordsBuilder customersWithFamilyNameOnlyBuilder =
      TestHelper.RecordsBuilder.newInstance(schemaWithFamilyNameOnly)
          .add(0L, "Brown")
          .add(1L, "Green")
          .add(2L, "Pink");
  List<Record> customersWithFamilyNameOnly = customersWithFamilyNameOnlyBuilder.build();

  // Run a 'select family_name' from Hive to check if the column can be queried with the new name.
  rows = shell.executeStatement("SELECT customer_id, family_name FROM default.customers");
  HiveIcebergTestUtils.validateData(customersWithFamilyNameOnly,
      HiveIcebergTestUtils.valueForRow(schemaWithFamilyNameOnly, rows), 0);

  // Run a 'select last_name' to check if an exception is thrown.
  AssertHelpers.assertThrows("should throw exception", IllegalArgumentException.class,
      "Invalid table alias or column reference 'last_name'", () -> {
        shell.executeStatement("SELECT last_name FROM default.customers");
      });

  // Insert some data from Hive to check if the renamed column can still be filled.
  shell.executeStatement("INSERT INTO default.customers values (3L, 'Lily', 'Magenta'), (4L, 'Roni', NULL)");
  List<Record> newCustomers = TestHelper.RecordsBuilder.newInstance(schemaWithFamilyName)
      .add(0L, "Alice", "Brown")
      .add(1L, "Bob", "Green")
      .add(2L, "Trudy", "Pink")
      .add(3L, "Lily", "Magenta")
      .add(4L, "Roni", null)
      .build();
  rows = shell.executeStatement("SELECT * FROM default.customers");
  HiveIcebergTestUtils.validateData(newCustomers, HiveIcebergTestUtils.valueForRow(schemaWithFamilyName, rows), 0);
  customersWithFamilyNameOnlyBuilder.add(3L, "Magenta").add(4L, null);
  customersWithFamilyNameOnly = customersWithFamilyNameOnlyBuilder.build();
  rows = shell.executeStatement("SELECT customer_id, family_name FROM default.customers");
  HiveIcebergTestUtils.validateData(customersWithFamilyNameOnly,
      HiveIcebergTestUtils.valueForRow(schemaWithFamilyNameOnly, rows), 0);
}
Example usage of org.apache.iceberg.data.Record in project hive by apache:
class TestHiveIcebergSchemaEvolution, method testAddRequiredColumnToIcebergTable.
@Test
public void testAddRequiredColumnToIcebergTable() throws IOException {
  // Create an Iceberg table with the columns customer_id, first_name and last_name without initial data.
  // The reason why not to add initial data is that adding a required column is an incompatible change in Iceberg.
  // So there is no contract on what happens when trying to read the old data back. It behaves differently depending
  // on the underlying file format. So there is no point creating a test for that as there is no expected behaviour.
  Table icebergTable = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
      fileFormat, null);

  // Add a new required column (age long); allowIncompatibleChanges() is needed for required columns.
  icebergTable.updateSchema().allowIncompatibleChanges().addRequiredColumn("age", Types.LongType.get()).commit();
  if (testTableType != TestTables.TestTableType.HIVE_CATALOG) {
    // We need to update columns for non-Hive catalogs
    shell.executeStatement("ALTER TABLE customers UPDATE COLUMNS");
  }

  // Expected schema after the change: the original three columns plus the required age column.
  Schema schemaWithAge = new Schema(
      optional(1, "customer_id", Types.LongType.get()),
      optional(2, "first_name", Types.StringType.get(), "This is first name"),
      optional(3, "last_name", Types.StringType.get(), "This is last name"),
      required(4, "age", Types.LongType.get()));

  // Insert rows that populate the age column from Hive.
  shell.executeStatement("INSERT INTO default.customers values (0L, 'Lily', 'Magenta', 28L), (1L, 'Roni', 'Purple', 33L)");

  // A 'select *' from Hive should include the age column in the result.
  List<Record> expectedCustomers = TestHelper.RecordsBuilder.newInstance(schemaWithAge)
      .add(0L, "Lily", "Magenta", 28L)
      .add(1L, "Roni", "Purple", 33L)
      .build();
  List<Object[]> resultRows = shell.executeStatement("SELECT * FROM default.customers");
  HiveIcebergTestUtils.validateData(expectedCustomers, HiveIcebergTestUtils.valueForRow(schemaWithAge, resultRows), 0);

  // Should add test step to insert NULL value into the new required column. But at the moment it
  // works inconsistently for different file types, so leave it for later when this behaviour is cleaned up.
}
Aggregations