use of org.apache.iceberg.data.Record in project hive by apache.
the class TestHiveIcebergInserts method testMultiTableInsert.
/**
 * Runs a multi-table insert (one {@code FROM} clause feeding two {@code INSERT INTO} branches)
 * twice - once in its plain form and once with an {@code ORDER BY} on each branch - and checks
 * after each run that both target tables hold the expected projection of the customers table.
 */
@Test
public void testMultiTableInsert() throws IOException {
  // Source table, created together with the shared customer fixture records.
  testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat,
      HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);

  // First target: (customer_id, first_name) and the rows it is expected to contain.
  Schema idAndFirstNameSchema = new Schema(
      optional(1, "customer_id", Types.LongType.get()),
      optional(2, "first_name", Types.StringType.get()));
  List<Record> expectedInTarget1 = TestHelper.RecordsBuilder.newInstance(idAndFirstNameSchema)
      .add(0L, "Alice")
      .add(1L, "Bob")
      .add(2L, "Trudy")
      .build();

  // Second target: (last_name, customer_id) - note the reversed column order.
  Schema lastNameAndIdSchema = new Schema(
      optional(1, "last_name", Types.StringType.get()),
      optional(2, "customer_id", Types.LongType.get()));
  List<Record> expectedInTarget2 = TestHelper.RecordsBuilder.newInstance(lastNameAndIdSchema)
      .add("Brown", 0L)
      .add("Green", 1L)
      .add("Pink", 2L)
      .build();

  // Both targets start out empty.
  Table firstTarget = testTables.createTable(shell, "target1", idAndFirstNameSchema, fileFormat, ImmutableList.of());
  Table secondTarget = testTables.createTable(shell, "target2", lastNameAndIdSchema, fileFormat, ImmutableList.of());

  // Simple insert: should create a single vertex writing to both target tables.
  String simpleMultiInsert = "FROM customers " +
      "INSERT INTO target1 SELECT customer_id, first_name " +
      "INSERT INTO target2 SELECT last_name, customer_id";
  shell.executeStatement(simpleMultiInsert);

  // NOTE(review): the last argument looks like the index of the column used to order rows before
  // comparing (0 for target1, 1 for target2 - customer_id in both cases); confirm in validateData.
  HiveIcebergTestUtils.validateData(firstTarget, expectedInTarget1, 0);
  HiveIcebergTestUtils.validateData(secondTarget, expectedInTarget2, 1);

  // Empty both targets so the second scenario starts from a clean state.
  testTables.truncateIcebergTable(firstTarget);
  testTables.truncateIcebergTable(secondTarget);

  // Complex insert: should use a different vertex for each target table.
  String orderedMultiInsert = "FROM customers " +
      "INSERT INTO target1 SELECT customer_id, first_name ORDER BY first_name " +
      "INSERT INTO target2 SELECT last_name, customer_id ORDER BY last_name";
  shell.executeStatement(orderedMultiInsert);

  // The contents must match the expectations of the simple variant exactly.
  HiveIcebergTestUtils.validateData(firstTarget, expectedInTarget1, 0);
  HiveIcebergTestUtils.validateData(secondTarget, expectedInTarget2, 1);
}
use of org.apache.iceberg.data.Record in project hive by apache.
the class TestHiveIcebergInserts method testInsertOverwriteNonPartitionedTable.
/**
 * Exercises {@code INSERT OVERWRITE} against a table created without initial records in three
 * consecutive steps: overwriting while the target is still empty, overwriting existing content,
 * and overwriting with an empty result set.
 */
@Test
public void testInsertOverwriteNonPartitionedTable() throws IOException {
  TableIdentifier targetId = TableIdentifier.of("default", "target");
  Table targetTable = testTables.createTable(shell, targetId.name(),
      HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, ImmutableList.of());

  // Step 1 - IOW into an empty target: the target must end up with exactly the source rows.
  testTables.createTable(shell, "source", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat,
      HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
  String overwriteFromSource = "INSERT OVERWRITE TABLE target SELECT * FROM source";
  shell.executeStatement(overwriteFromSource);
  HiveIcebergTestUtils.validateData(targetTable, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 0);

  // Step 2 - IOW into a non-empty target: only the newly written rows may remain.
  List<Record> replacementRecords =
      TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
          .add(0L, "Mike", "Taylor")
          .add(1L, "Christy", "Hubert")
          .build();
  // The 'true' flag presumably makes getInsertQuery emit an overwrite statement - TODO confirm.
  String overwriteWithReplacement = testTables.getInsertQuery(replacementRecords, targetId, true);
  shell.executeStatement(overwriteWithReplacement);
  HiveIcebergTestUtils.validateData(targetTable, replacementRecords, 0);

  // Step 3 - IOW with an empty result set: the target table must be cleared.
  String overwriteWithNothing = "INSERT OVERWRITE TABLE target SELECT * FROM source WHERE FALSE";
  shell.executeStatement(overwriteWithNothing);
  HiveIcebergTestUtils.validateData(targetTable, ImmutableList.of(), 0);
}
use of org.apache.iceberg.data.Record in project hive by apache.
the class TestHiveIcebergInserts method testInsertFromSelectWithProjection.
/**
 * Inserts into a subset of the customers columns ({@code customer_id}, {@code last_name}) from a
 * select over the orders table and checks that the omitted column ({@code first_name}) is null
 * in every resulting row.
 */
@Test
public void testInsertFromSelectWithProjection() throws IOException {
  // Empty customers table as the insert target; orders carries the source data.
  Table customers = testTables.createTable(shell, "customers",
      HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, ImmutableList.of());
  testTables.createTable(shell, "orders", ORDER_SCHEMA, fileFormat, ORDER_RECORDS);

  String projectedInsert =
      "INSERT INTO customers (customer_id, last_name) SELECT distinct(customer_id), 'test' FROM orders";
  shell.executeStatement(projectedInsert);

  // One row per expected customer id (0 and 1), with the constant last name and no first name.
  TestHelper.RecordsBuilder expectedRows =
      TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA);
  expectedRows.add(0L, null, "test");
  expectedRows.add(1L, null, "test");
  List<Record> expected = expectedRows.build();

  HiveIcebergTestUtils.validateData(customers, expected, 0);
}
use of org.apache.iceberg.data.Record in project hive by apache.
the class TestHiveIcebergOutputCommitter method writeRecords.
/**
 * Simulates the tasks of a job: for every task index a fresh {@link HiveIcebergRecordWriter} is
 * created, a batch of generated records is written through it, the writer is closed, and the task
 * attempt is then committed, aborted, or left untouched on the supplied committer.
 * @param name The name of the table to get the table object from the conf
 * @param taskNum The number of tasks to simulate
 * @param attemptNum The attempt number used for the task attempt ids; it is also added to the task
 *                   index to form the argument passed to the record generator
 * @param commitTasks If {@code true} every task is committed after its records are written
 * @param abortTasks If {@code true} (and {@code commitTasks} is {@code false}) every task is aborted; with
 *                   both flags {@code false} the tasks are neither committed nor aborted
 * @param conf The job configuration
 * @param committer The output committer that should be used for committing/aborting the tasks
 * @return The generated records of the tasks which were committed - empty if {@code commitTasks} is
 *         {@code false}
 * @throws IOException Propagating {@link HiveIcebergRecordWriter} exceptions
 */
private List<Record> writeRecords(String name, int taskNum, int attemptNum, boolean commitTasks, boolean abortTasks, JobConf conf, OutputCommitter committer) throws IOException {
  Table table = HiveIcebergStorageHandler.table(conf, name);
  Schema schema = HiveIcebergStorageHandler.schema(conf);
  PartitionSpec spec = table.spec();
  FileIO io = table.io();

  // These values are identical for every task, so they are computed once up front.
  final FileFormat fileFormat = FileFormat.PARQUET;
  final String operationId = QUERY_ID + "-" + JOB_ID;

  List<Record> committedRecords = new ArrayList<>(RECORD_NUM * taskNum);
  for (int taskIndex = 0; taskIndex < taskNum; taskIndex++) {
    List<Record> taskRecords = TestHelper.generateRandomRecords(schema, RECORD_NUM, taskIndex + attemptNum);
    TaskAttemptID attemptId =
        new TaskAttemptID(JOB_ID.getJtIdentifier(), JOB_ID.getId(), TaskType.MAP, taskIndex, attemptNum);

    // The numeric id of the task doubles as the partition id of the output file factory.
    OutputFileFactory fileFactory = OutputFileFactory
        .builderFor(table, attemptId.getTaskID().getId(), attemptNum)
        .format(fileFormat)
        .operationId(operationId)
        .build();
    HiveFileWriterFactory writerFactory =
        new HiveFileWriterFactory(table, fileFormat, schema, null, fileFormat, null, null, null, null);
    HiveIcebergRecordWriter writer = new HiveIcebergRecordWriter(schema, spec, fileFormat, writerFactory,
        fileFactory, io, TARGET_FILE_SIZE, TezUtil.taskAttemptWrapper(attemptId), conf.get(Catalogs.NAME));

    // A single container instance is reused as the carrier for each record handed to the writer.
    Container<Record> carrier = new Container<>();
    for (Record taskRecord : taskRecords) {
      carrier.set(taskRecord);
      writer.write(carrier);
    }
    writer.close(false);

    if (commitTasks) {
      committer.commitTask(new TaskAttemptContextImpl(conf, attemptId));
      // Only committed tasks contribute to the records the caller should expect.
      committedRecords.addAll(taskRecords);
    } else if (abortTasks) {
      committer.abortTask(new TaskAttemptContextImpl(conf, attemptId));
    }
  }
  return committedRecords;
}
use of org.apache.iceberg.data.Record in project hive by apache.
the class TestHiveIcebergOutputCommitter method testSuccessfulMultipleTasksPartitionedWrite.
@Test
public void testSuccessfulMultipleTasksPartitionedWrite() throws IOException {
HiveIcebergOutputCommitter committer = new HiveIcebergOutputCommitter();
Table table = table(temp.getRoot().getPath(), true);
JobConf conf = jobConf(table, 2);
List<Record> expected = writeRecords(table.name(), 2, 0, true, false, conf);
committer.commitJob(new JobContextImpl(conf, JOB_ID));
// Expecting 6 files with fanout-, 8 with ClusteredWriter where writing to already completed partitions is allowed.
HiveIcebergTestUtils.validateFiles(table, conf, JOB_ID, 8);
HiveIcebergTestUtils.validateData(table, expected, 0);
}
Aggregations