
Example 86 with Record

use of org.apache.iceberg.data.Record in project hive by apache.

the class HiveIcebergTestUtils method createPositionalDeleteFile.

/**
 * @param table The table to create the delete file for
 * @param deleteFilePath The path where the delete file should be created, relative to the table location root
 * @param fileFormat The file format that should be used for writing out the delete file
 * @param partitionValues A map of partition values (partitionKey=partitionVal, ...) to be used for the delete file
 * @param deletes The list of position deletes, each containing the data file path, the position of the row in the
 *                data file and the row itself that should be deleted
 * @return The DeleteFile created
 * @throws IOException If there is an error during DeleteFile write
 */
public static DeleteFile createPositionalDeleteFile(Table table, String deleteFilePath, FileFormat fileFormat,
        Map<String, Object> partitionValues, List<PositionDelete<Record>> deletes) throws IOException {
    Schema posDeleteRowSchema = deletes.get(0).row() == null ? null : table.schema();
    FileAppenderFactory<Record> appenderFactory =
        new GenericAppenderFactory(table.schema(), table.spec(), null, null, posDeleteRowSchema);
    EncryptedOutputFile outputFile = table.encryption().encrypt(HadoopOutputFile.fromPath(
        new org.apache.hadoop.fs.Path(table.location(), deleteFilePath), new Configuration()));
    PartitionKey partitionKey = null;
    if (partitionValues != null) {
        Record record = GenericRecord.create(table.schema()).copy(partitionValues);
        partitionKey = new PartitionKey(table.spec(), table.schema());
        partitionKey.partition(record);
    }
    PositionDeleteWriter<Record> posWriter = appenderFactory.newPosDeleteWriter(outputFile, fileFormat, partitionKey);
    try (PositionDeleteWriter<Record> writer = posWriter) {
        deletes.forEach(del -> writer.delete(del.path(), del.pos(), del.row()));
    }
    return posWriter.toDeleteFile();
}
Also used : Path(java.nio.file.Path) Configuration(org.apache.hadoop.conf.Configuration) EncryptedOutputFile(org.apache.iceberg.encryption.EncryptedOutputFile) Schema(org.apache.iceberg.Schema) PartitionKey(org.apache.iceberg.PartitionKey) GenericRecord(org.apache.iceberg.data.GenericRecord) Record(org.apache.iceberg.data.Record) GenericAppenderFactory(org.apache.iceberg.data.GenericAppenderFactory)
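
A minimal caller sketch for this helper follows. It assumes an unpartitioned table handle named testTable and placeholder file paths, and the delete carries no row payload, so the writer is created without a delete row schema. PositionDelete and the row-delta commit come from the core Iceberg API; this is illustrative and not part of the original test utilities.

static void deleteFirstRowSketch(Table testTable) throws IOException {
    // Placeholder data file path and position; in a real test these come from the table's data files.
    PositionDelete<Record> delete = PositionDelete.create();
    delete.set(testTable.location() + "/data/00000-0-data.parquet", 0L, null);
    DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(
        testTable, "pos-deletes/delete-00001", FileFormat.PARQUET, null, Collections.singletonList(delete));
    // Register the positional delete file so subsequent reads skip the deleted row.
    testTable.newRowDelta().addDeletes(deleteFile).commit();
}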

Example 87 with Record

use of org.apache.iceberg.data.Record in project hive by apache.

the class TestHiveIcebergOutputCommitter method testRetryTask.

@Test
public void testRetryTask() throws IOException {
    HiveIcebergOutputCommitter committer = new HiveIcebergOutputCommitter();
    Table table = table(temp.getRoot().getPath(), false);
    JobConf conf = jobConf(table, 2);
    // Write records and abort the tasks
    writeRecords(table.name(), 2, 0, false, true, conf);
    HiveIcebergTestUtils.validateFiles(table, conf, JOB_ID, 0);
    HiveIcebergTestUtils.validateData(table, Collections.emptyList(), 0);
    // Write records but do not abort the tasks
    // The data files remain on disk since we cannot identify them for cleanup, but they are never committed, so they must not be read
    writeRecords(table.name(), 2, 1, false, false, conf);
    HiveIcebergTestUtils.validateFiles(table, conf, JOB_ID, 2);
    HiveIcebergTestUtils.validateData(table, Collections.emptyList(), 0);
    // Write and commit the records
    List<Record> expected = writeRecords(table.name(), 2, 2, true, false, conf);
    committer.commitJob(new JobContextImpl(conf, JOB_ID));
    HiveIcebergTestUtils.validateFiles(table, conf, JOB_ID, 4);
    HiveIcebergTestUtils.validateData(table, expected, 0);
}
Also used : JobContextImpl(org.apache.hadoop.mapred.JobContextImpl) Table(org.apache.iceberg.Table) Record(org.apache.iceberg.data.Record) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.Test)
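
As a hedged note on what the test exercises: Iceberg readers only see files referenced from a committed snapshot, so files written by aborted or never-committed tasks remain on disk but validate to zero rows. A rough job-level sketch follows, reusing the conf and JOB_ID from the test and omitting the task-level calls handled inside the test-specific writeRecords helper; it is illustrative, not part of the original test.

// Illustrative only: the job commit is what turns committed task outputs into an Iceberg snapshot.
HiveIcebergOutputCommitter committer = new HiveIcebergOutputCommitter();
JobContext jobContext = new JobContextImpl(conf, JOB_ID);
committer.commitJob(jobContext);
// A failed job would call abortJob(...) instead, cleaning up whatever output can still be identified.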

Example 88 with Record

use of org.apache.iceberg.data.Record in project hive by apache.

the class TestHiveIcebergSchemaEvolution method testMakeColumnRequiredInIcebergTable.

@Test
public void testMakeColumnRequiredInIcebergTable() throws IOException {
    // Create an Iceberg table with the columns customer_id, first_name and last_name with some initial data.
    Table icebergTable = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
    // Make the existing last_name column required in the Iceberg table (an incompatible change).
    icebergTable.updateSchema().allowIncompatibleChanges().requireColumn("last_name").commit();
    List<Object[]> rows = shell.executeStatement("SELECT * FROM default.customers");
    HiveIcebergTestUtils.validateData(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, rows), 0);
    // Insert some data with non-NULL last_name values.
    shell.executeStatement("INSERT INTO default.customers values (3L, 'Lily', 'Magenta'), (4L, 'Roni', 'Purple')");
    List<Record> customerRecords = TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
        .add(0L, "Alice", "Brown").add(1L, "Bob", "Green").add(2L, "Trudy", "Pink")
        .add(3L, "Lily", "Magenta").add(4L, "Roni", "Purple").build();
    rows = shell.executeStatement("SELECT * FROM default.customers");
    HiveIcebergTestUtils.validateData(customerRecords, HiveIcebergTestUtils.valueForRow(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, rows), 0);
// A test step inserting a NULL value into the newly required column should be added here, but at the moment it
// behaves inconsistently across file formats, so it is left for later once this behaviour is cleaned up.
}
Also used : Table(org.apache.iceberg.Table) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)
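
As a brief, hypothetical follow-up (not part of the test): relaxing the column back to optional is a compatible change, so it does not need allowIncompatibleChanges().

// Hypothetical reversal of the change above; making a required column optional again is always allowed.
icebergTable.updateSchema().makeColumnOptional("last_name").commit();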

Example 89 with Record

use of org.apache.iceberg.data.Record in project hive by apache.

the class TestHiveIcebergSchemaEvolution method testRemoveAndAddBackColumnFromIcebergTable.

@Test
public void testRemoveAndAddBackColumnFromIcebergTable() throws IOException {
    // Create an Iceberg table with the columns customer_id, first_name and last_name with some initial data.
    Table icebergTable = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
    // Remove the first_name column
    icebergTable.updateSchema().deleteColumn("first_name").commit();
    // Add a new column with the name first_name
    icebergTable.updateSchema().addColumn("first_name", Types.StringType.get(), "This is new first name").commit();
    // Add new data to the table with the new first_name column filled.
    icebergTable = testTables.loadTable(TableIdentifier.of("default", "customers"));
    Schema customerSchemaWithNewFirstName = new Schema(optional(1, "customer_id", Types.LongType.get()), optional(2, "last_name", Types.StringType.get(), "This is last name"), optional(3, "first_name", Types.StringType.get(), "This is the newly added first name"));
    List<Record> newCustomersWithNewFirstName = TestHelper.RecordsBuilder.newInstance(customerSchemaWithNewFirstName).add(3L, "Red", "James").build();
    testTables.appendIcebergTable(shell.getHiveConf(), icebergTable, fileFormat, null, newCustomersWithNewFirstName);
    TestHelper.RecordsBuilder customersWithNewFirstNameBuilder = TestHelper.RecordsBuilder.newInstance(customerSchemaWithNewFirstName).add(0L, "Brown", null).add(1L, "Green", null).add(2L, "Pink", null).add(3L, "Red", "James");
    List<Record> customersWithNewFirstName = customersWithNewFirstNameBuilder.build();
    // Run a 'select *' from Hive and check if the first_name column is returned.
    // It should be null for the old data and should be filled in the entry added after the column addition.
    List<Object[]> rows = shell.executeStatement("SELECT * FROM default.customers");
    HiveIcebergTestUtils.validateData(customersWithNewFirstName, HiveIcebergTestUtils.valueForRow(customerSchemaWithNewFirstName, rows), 0);
    Schema customerSchemaWithNewFirstNameOnly = new Schema(optional(1, "customer_id", Types.LongType.get()), optional(3, "first_name", Types.StringType.get(), "This is the newly added first name"));
    TestHelper.RecordsBuilder customersWithNewFirstNameOnlyBuilder = TestHelper.RecordsBuilder.newInstance(customerSchemaWithNewFirstNameOnly).add(0L, null).add(1L, null).add(2L, null).add(3L, "James");
    List<Record> customersWithNewFirstNameOnly = customersWithNewFirstNameOnlyBuilder.build();
    // Run a 'SELECT customer_id, first_name' from Hive to check that the new first_name column can be queried.
    rows = shell.executeStatement("SELECT customer_id, first_name FROM default.customers");
    HiveIcebergTestUtils.validateData(customersWithNewFirstNameOnly, HiveIcebergTestUtils.valueForRow(customerSchemaWithNewFirstNameOnly, rows), 0);
    // Insert data from Hive, one row with first_name filled and one with a NULL first_name.
    shell.executeStatement("INSERT INTO default.customers values (4L, 'Magenta', 'Lily'), (5L, 'Purple', NULL)");
    // Check if the newly inserted data is returned correctly by select statements.
    customersWithNewFirstNameBuilder.add(4L, "Magenta", "Lily").add(5L, "Purple", null);
    customersWithNewFirstName = customersWithNewFirstNameBuilder.build();
    rows = shell.executeStatement("SELECT * FROM default.customers");
    HiveIcebergTestUtils.validateData(customersWithNewFirstName, HiveIcebergTestUtils.valueForRow(customerSchemaWithNewFirstName, rows), 0);
    customersWithNewFirstNameOnlyBuilder.add(4L, "Lily").add(5L, null);
    customersWithNewFirstNameOnly = customersWithNewFirstNameOnlyBuilder.build();
    rows = shell.executeStatement("SELECT customer_id, first_name FROM default.customers");
    HiveIcebergTestUtils.validateData(customersWithNewFirstNameOnly, HiveIcebergTestUtils.valueForRow(customerSchemaWithNewFirstNameOnly, rows), 0);
}
Also used : TestHelper(org.apache.iceberg.mr.TestHelper) Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)
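
The null values asserted above follow from Iceberg resolving columns by field id: the re-added first_name receives a fresh id, so values written under the old id are not projected into it. A small assumed verification sketch, not part of the test:

// Illustrative: inspect the field id assigned to the re-added column.
Types.NestedField reAddedFirstName = icebergTable.schema().findField("first_name");
System.out.println("first_name now has field id " + reAddedFirstName.fieldId());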

Example 90 with Record

use of org.apache.iceberg.data.Record in project hive by apache.

the class TestHiveIcebergSchemaEvolution method testSchemaEvolutionOnVectorizedReads.

// Tests the CHANGE COLUMN feature similarly to the cases above, but with a more complex schema, to verify that
// vectorized reads support the feature properly, also in combination with other schema changes, e.g. ADD COLUMN
@Test
public void testSchemaEvolutionOnVectorizedReads() throws Exception {
    // Currently only ORC, but in the future this should run against each file format with vectorized read support.
    Assume.assumeTrue("Vectorized reads only.", isVectorized);
    Schema orderSchema = new Schema(
        optional(1, "order_id", Types.IntegerType.get()),
        optional(2, "customer_first_name", Types.StringType.get()),
        optional(3, "customer_last_name", Types.StringType.get()),
        optional(4, "quantity", Types.IntegerType.get()),
        optional(5, "price", Types.IntegerType.get()),
        optional(6, "item", Types.StringType.get()));
    List<Record> records = TestHelper.RecordsBuilder.newInstance(orderSchema)
        .add(1, "Doctor", "Strange", 100, 3, "apple").add(2, "Tony", "Stark", 150, 2, "apple")
        .add(3, "Tony", "Stark", 200, 6, "orange").add(4, "Steve", "Rogers", 100, 8, "banana")
        .add(5, "Doctor", "Strange", 800, 7, "orange").add(6, "Thor", "Odinson", 650, 3, "apple").build();
    testTables.createTable(shell, "orders", orderSchema, fileFormat, records);
    // Reorder columns and rename one column
    shell.executeStatement("ALTER TABLE orders CHANGE COLUMN " + "customer_first_name customer_first_name string AFTER customer_last_name");
    shell.executeStatement("ALTER TABLE orders CHANGE COLUMN " + "quantity quantity int AFTER price");
    shell.executeStatement("ALTER TABLE orders CHANGE COLUMN " + "item fruit string");
    List<Object[]> result = shell.executeStatement("SELECT customer_first_name, customer_last_name, SUM(quantity) " + "FROM orders where price >= 3 group by customer_first_name, customer_last_name order by customer_first_name");
    assertQueryResult(result, 4, "Doctor", "Strange", 900L, "Steve", "Rogers", 100L, "Thor", "Odinson", 650L, "Tony", "Stark", 200L);
    // Adding a new column (will end up as last col of the schema)
    shell.executeStatement("ALTER TABLE orders ADD COLUMNS (nickname string)");
    shell.executeStatement("INSERT INTO orders VALUES (7, 'Romanoff', 'Natasha', 3, 250, 'apple', 'Black Widow')");
    result = shell.executeStatement("SELECT customer_first_name, customer_last_name, nickname, SUM(quantity) " + " FROM orders where price >= 3 group by customer_first_name, customer_last_name, nickname " + " order by customer_first_name");
    assertQueryResult(result, 5, "Doctor", "Strange", null, 900L, "Natasha", "Romanoff", "Black Widow", 250L, "Steve", "Rogers", null, 100L, "Thor", "Odinson", null, 650L, "Tony", "Stark", null, 200L);
    // Re-order newly added column (move it away from being the last column)
    shell.executeStatement("ALTER TABLE orders CHANGE COLUMN fruit fruit string AFTER nickname");
    result = shell.executeStatement("SELECT customer_first_name, customer_last_name, nickname, fruit, SUM(quantity) " + " FROM orders where price >= 3 and fruit < 'o' group by customer_first_name, customer_last_name, nickname, " + "fruit order by customer_first_name");
    assertQueryResult(result, 4, "Doctor", "Strange", null, "apple", 100L, "Natasha", "Romanoff", "Black Widow", "apple", 250L, "Steve", "Rogers", null, "banana", 100L, "Thor", "Odinson", null, "apple", 650L);
    // Rename newly added column (+ reading with different file includes)
    shell.executeStatement("ALTER TABLE orders CHANGE COLUMN nickname nick string");
    result = shell.executeStatement("SELECT customer_first_name, nick, SUM(quantity) " + " FROM orders where fruit < 'o'and nick IS NOT NULL group by customer_first_name, nick");
    assertQueryResult(result, 1, "Natasha", "Black Widow", 250L);
    // Re-order between different types
    shell.executeStatement("ALTER TABLE orders CHANGE COLUMN order_id order_id int AFTER customer_first_name");
    result = shell.executeStatement("SELECT customer_first_name, nick, SUM(quantity), MIN(order_id) " + " FROM orders where fruit < 'o'and nick IS NOT NULL group by customer_first_name, nick");
    assertQueryResult(result, 1, "Natasha", "Black Widow", 250L, 7);
    // Drop columns via REPLACE COLUMNS
    shell.executeStatement("ALTER TABLE orders REPLACE COLUMNS (" + "customer_last_name string COMMENT 'from deserializer', order_id int COMMENT 'from deserializer'," + " quantity int, nick string COMMENT 'from deserializer'," + " fruit string COMMENT 'from deserializer')");
    result = shell.executeStatement("DESCRIBE orders");
    assertQueryResult(result, 5, "customer_last_name", "string", "from deserializer", "order_id", "int", "from deserializer", "quantity", "int", "from deserializer", "nick", "string", "from deserializer", "fruit", "string", "from deserializer");
    result = shell.executeStatement("SELECT * FROM orders ORDER BY order_id");
    assertQueryResult(result, 7, "Strange", 1, 100, null, "apple", "Stark", 2, 150, null, "apple", "Stark", 3, 200, null, "orange", "Rogers", 4, 100, null, "banana", "Strange", 5, 800, null, "orange", "Odinson", 6, 650, null, "apple", "Romanoff", 7, 250, "Black Widow", "apple");
}
Also used : Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)
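
For reference, an assumed sketch of roughly the Iceberg UpdateSchema calls that the reorder/rename/add statements above correspond to; the test itself goes through HiveQL, and the table handle here (ordersTable) is a placeholder.

// Approximate native-API equivalents of the ALTER TABLE statements above (illustrative only).
ordersTable.updateSchema()
    .moveAfter("customer_first_name", "customer_last_name")
    .moveAfter("quantity", "price")
    .renameColumn("item", "fruit")
    .addColumn("nickname", Types.StringType.get())
    .commit();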

Aggregations

Record (org.apache.iceberg.data.Record): 114
Test (org.junit.Test): 99
Schema (org.apache.iceberg.Schema): 68
Table (org.apache.iceberg.Table): 51
GenericRecord (org.apache.iceberg.data.GenericRecord): 51
PartitionSpec (org.apache.iceberg.PartitionSpec): 19
ArrayList (java.util.ArrayList): 14
List (java.util.List): 13
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 12
HashMap (java.util.HashMap): 11
IcebergBaseTest (org.apache.drill.metastore.iceberg.IcebergBaseTest): 11
TestHelper (org.apache.iceberg.mr.TestHelper): 11
ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList): 10
Types (org.apache.iceberg.types.Types): 10
Map (java.util.Map): 9
IOException (java.io.IOException): 8
ImmutableMap (org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap): 8
FileFormat (org.apache.iceberg.FileFormat): 7
DeleteFile (org.apache.iceberg.DeleteFile): 6
NestedField.optional (org.apache.iceberg.types.Types.NestedField.optional): 6