
Example 81 with Schema

use of org.apache.iceberg.Schema in project hive by apache.

the class TestHiveIcebergInserts method testMultiTableInsert.

@Test
public void testMultiTableInsert() throws IOException {
    testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
    Schema target1Schema = new Schema(optional(1, "customer_id", Types.LongType.get()), optional(2, "first_name", Types.StringType.get()));
    Schema target2Schema = new Schema(optional(1, "last_name", Types.StringType.get()), optional(2, "customer_id", Types.LongType.get()));
    List<Record> target1Records = TestHelper.RecordsBuilder.newInstance(target1Schema).add(0L, "Alice").add(1L, "Bob").add(2L, "Trudy").build();
    List<Record> target2Records = TestHelper.RecordsBuilder.newInstance(target2Schema).add("Brown", 0L).add("Green", 1L).add("Pink", 2L).build();
    Table target1 = testTables.createTable(shell, "target1", target1Schema, fileFormat, ImmutableList.of());
    Table target2 = testTables.createTable(shell, "target2", target2Schema, fileFormat, ImmutableList.of());
    // simple insert: should create a single vertex writing to both target tables
    shell.executeStatement("FROM customers " + "INSERT INTO target1 SELECT customer_id, first_name " + "INSERT INTO target2 SELECT last_name, customer_id");
    // Check that everything is as expected
    HiveIcebergTestUtils.validateData(target1, target1Records, 0);
    HiveIcebergTestUtils.validateData(target2, target2Records, 1);
    // truncate the target tables
    testTables.truncateIcebergTable(target1);
    testTables.truncateIcebergTable(target2);
    // complex insert: should use a different vertex for each target table
    shell.executeStatement("FROM customers " + "INSERT INTO target1 SELECT customer_id, first_name ORDER BY first_name " + "INSERT INTO target2 SELECT last_name, customer_id ORDER BY last_name");
    // Check that everything is as expected
    HiveIcebergTestUtils.validateData(target1, target1Records, 0);
    HiveIcebergTestUtils.validateData(target2, target2Records, 1);
}
Also used: Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) Record(org.apache.iceberg.data.Record) Test(org.junit.Test)
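
The optional(...) calls above resolve through a static import from Iceberg's type system. As a minimal, self-contained sketch (plain Iceberg API, outside the test harness), target1's schema can be built like this:

import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;
import static org.apache.iceberg.types.Types.NestedField.optional;

// Field IDs are scoped to each schema, so both target tables can number
// their columns starting from 1 without clashing.
Schema target1Schema = new Schema(
    optional(1, "customer_id", Types.LongType.get()),
    optional(2, "first_name", Types.StringType.get()));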

Example 82 with Schema

use of org.apache.iceberg.Schema in project hive by apache.

the class TestHiveIcebergOutputCommitter method writeRecords.

/**
 * Write random records to the given table, using a separate {@link HiveIcebergOutputCommitter} and
 * a separate {@link HiveIcebergRecordWriter} for every task.
 * @param name The name of the table to get the table object from the conf
 * @param taskNum The number of tasks in the job handled by the committer
 * @param attemptNum The id used for attempt number generation
 * @param commitTasks If <code>true</code>, the tasks will be committed
 * @param abortTasks If <code>true</code>, the tasks will be aborted - needed so we can simulate a
 *                   no-commit/no-abort situation
 * @param conf The job configuration
 * @param committer The output committer that should be used for committing/aborting the tasks
 * @return The randomly generated records which were appended to the table
 * @throws IOException Propagating {@link HiveIcebergRecordWriter} exceptions
 */
private List<Record> writeRecords(String name, int taskNum, int attemptNum, boolean commitTasks, boolean abortTasks, JobConf conf, OutputCommitter committer) throws IOException {
    List<Record> expected = new ArrayList<>(RECORD_NUM * taskNum);
    Table table = HiveIcebergStorageHandler.table(conf, name);
    FileIO io = table.io();
    Schema schema = HiveIcebergStorageHandler.schema(conf);
    PartitionSpec spec = table.spec();
    for (int i = 0; i < taskNum; ++i) {
        List<Record> records = TestHelper.generateRandomRecords(schema, RECORD_NUM, i + attemptNum);
        TaskAttemptID taskId = new TaskAttemptID(JOB_ID.getJtIdentifier(), JOB_ID.getId(), TaskType.MAP, i, attemptNum);
        int partitionId = taskId.getTaskID().getId();
        String operationId = QUERY_ID + "-" + JOB_ID;
        FileFormat fileFormat = FileFormat.PARQUET;
        OutputFileFactory outputFileFactory = OutputFileFactory.builderFor(table, partitionId, attemptNum).format(fileFormat).operationId(operationId).build();
        HiveFileWriterFactory hfwf = new HiveFileWriterFactory(table, fileFormat, schema, null, fileFormat, null, null, null, null);
        HiveIcebergRecordWriter testWriter = new HiveIcebergRecordWriter(schema, spec, fileFormat, hfwf, outputFileFactory, io, TARGET_FILE_SIZE, TezUtil.taskAttemptWrapper(taskId), conf.get(Catalogs.NAME));
        Container<Record> container = new Container<>();
        for (Record record : records) {
            container.set(record);
            testWriter.write(container);
        }
        testWriter.close(false);
        if (commitTasks) {
            committer.commitTask(new TaskAttemptContextImpl(conf, taskId));
            expected.addAll(records);
        } else if (abortTasks) {
            committer.abortTask(new TaskAttemptContextImpl(conf, taskId));
        }
    }
    return expected;
}
Also used: OutputFileFactory(org.apache.iceberg.io.OutputFileFactory) Table(org.apache.iceberg.Table) TaskAttemptID(org.apache.hadoop.mapred.TaskAttemptID) Schema(org.apache.iceberg.Schema) ArrayList(java.util.ArrayList) FileFormat(org.apache.iceberg.FileFormat) PartitionSpec(org.apache.iceberg.PartitionSpec) FileIO(org.apache.iceberg.io.FileIO) Container(org.apache.iceberg.mr.mapred.Container) TaskAttemptContextImpl(org.apache.hadoop.mapred.TaskAttemptContextImpl) Record(org.apache.iceberg.data.Record)
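
The Container<Record> seen above is Iceberg's mapred-friendly mutable holder; the loop reuses one instance for every row rather than allocating a wrapper per record. A condensed sketch of just that pattern, assuming a writer variable of the same HiveIcebergRecordWriter type used above:

Container<Record> container = new Container<>();
for (Record record : records) {
    container.set(record);   // repoint the holder at the current row
    writer.write(container); // the writer reads the wrapped record
}
// false = normal (non-abort) close; true marks an aborted task
writer.close(false);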

Example 83 with Schema

use of org.apache.iceberg.Schema in project hive by apache.

the class DeleteReadTests method testMixedPositionAndEqualityDeletes.

@Test
public void testMixedPositionAndEqualityDeletes() throws IOException {
    Schema dataSchema = table.schema().select("data");
    Record dataDelete = GenericRecord.create(dataSchema);
    List<Record> dataDeletes = Lists.newArrayList(
        dataDelete.copy("data", "a"), // id = 29
        dataDelete.copy("data", "d"), // id = 89
        dataDelete.copy("data", "g")); // id = 122
    DeleteFile eqDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, dataSchema);
    List<Pair<CharSequence, Long>> deletes = Lists.newArrayList(
        Pair.of(dataFile.path(), 3L), // id = 89
        Pair.of(dataFile.path(), 5L)); // id = 121
    Pair<DeleteFile, Set<CharSequence>> posDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), Row.of(0), deletes);
    table.newRowDelta().addDeletes(eqDeletes).addDeletes(posDeletes.first()).validateDataFilesExist(posDeletes.second()).commit();
    StructLikeSet expected = rowSetWithoutIds(29, 89, 121, 122);
    StructLikeSet actual = rowSet(tableName, table, "*");
    Assert.assertEquals("Table should contain expected rows", expected, actual);
}
Also used: StructLikeSet(org.apache.iceberg.util.StructLikeSet) Set(java.util.Set) Schema(org.apache.iceberg.Schema) DeleteFile(org.apache.iceberg.DeleteFile) Pair(org.apache.iceberg.util.Pair) Test(org.junit.Test)
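
rowSetWithoutIds(...) comes from the test base class. A hypothetical sketch of what such a helper might do; the records field and all names here are assumptions based on this test, not the actual implementation:

// Build the expected row set by filtering the known input rows against
// the ids removed by the position and equality deletes.
Set<Integer> deletedIds = Set.of(29, 89, 121, 122);
StructLikeSet expected = StructLikeSet.create(table.schema().asStruct());
records.stream()
    .filter(record -> !deletedIds.contains(record.getField("id")))
    .forEach(expected::add);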

Example 84 with Schema

use of org.apache.iceberg.Schema in project hive by apache.

the class DeleteReadTests method testEqualityDeletesSpanningMultipleDataFiles.

@Test
public void testEqualityDeletesSpanningMultipleDataFiles() throws IOException {
    // Add another DataFile with common values
    GenericRecord record = GenericRecord.create(table.schema());
    records.add(record.copy("id", 144, "data", "a"));
    this.dataFile = FileHelpers.writeDataFile(table, Files.localOutput(temp.newFile()), Row.of(0), records);
    table.newAppend().appendFile(dataFile).commit();
    Schema deleteRowSchema = table.schema().select("data");
    Record dataDelete = GenericRecord.create(deleteRowSchema);
    List<Record> dataDeletes = Lists.newArrayList(
        dataDelete.copy("data", "a"), // id = 29, 144
        dataDelete.copy("data", "d"), // id = 89
        dataDelete.copy("data", "g")); // id = 122
    DeleteFile eqDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, deleteRowSchema);
    table.newRowDelta().addDeletes(eqDeletes).commit();
    StructLikeSet expected = rowSetWithoutIds(29, 89, 122, 144);
    StructLikeSet actual = rowSet(tableName, table, "*");
    Assert.assertEquals("Table should contain expected rows", expected, actual);
}
Also used: Schema(org.apache.iceberg.Schema) StructLikeSet(org.apache.iceberg.util.StructLikeSet) DeleteFile(org.apache.iceberg.DeleteFile) Test(org.junit.Test)
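
What this test pins down is that an equality delete is keyed on column values rather than on a file/position pair, so a single delete row removes matching rows from every data file, including the one appended mid-test. Conceptually (an illustration of the matching rule using java.util.Objects, not Iceberg's internal reader code):

// Illustrative only: a row survives when its projected delete-key value
// matches none of the equality-delete rows.
static boolean survives(Record row, List<Record> deleteRows) {
    Object key = row.getField("data");
    return deleteRows.stream()
        .noneMatch(del -> Objects.equals(del.getField("data"), key));
}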

Example 85 with Schema

use of org.apache.iceberg.Schema in project hive by apache.

the class DeleteReadTests method testMultipleEqualityDeleteSchemas.

@Test
public void testMultipleEqualityDeleteSchemas() throws IOException {
    Schema dataSchema = table.schema().select("data");
    Record dataDelete = GenericRecord.create(dataSchema);
    List<Record> dataDeletes = Lists.newArrayList(
        dataDelete.copy("data", "a"), // id = 29
        dataDelete.copy("data", "d"), // id = 89
        dataDelete.copy("data", "g")); // id = 122
    DeleteFile dataEqDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), Row.of(0), dataDeletes, dataSchema);
    Schema idSchema = table.schema().select("id");
    Record idDelete = GenericRecord.create(idSchema);
    List<Record> idDeletes = Lists.newArrayList(
        idDelete.copy("id", 121), // id = 121
        idDelete.copy("id", 29)); // id = 29
    DeleteFile idEqDeletes = FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), Row.of(0), idDeletes, idSchema);
    table.newRowDelta().addDeletes(dataEqDeletes).addDeletes(idEqDeletes).commit();
    StructLikeSet expected = rowSetWithoutIds(29, 89, 121, 122);
    StructLikeSet actual = rowSet(tableName, table, "*");
    Assert.assertEquals("Table should contain expected rows", expected, actual);
}
Also used: Schema(org.apache.iceberg.Schema) StructLikeSet(org.apache.iceberg.util.StructLikeSet) DeleteFile(org.apache.iceberg.DeleteFile) Test(org.junit.Test)
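
Each delete file here carries its own row schema, produced with Schema.select(...). A short sketch of that projection; the column ids shown assume the base table schema is (1: id, 2: data):

Schema full = table.schema();           // 1: id, 2: data
Schema dataOnly = full.select("data");  // keeps field id 2
Schema idOnly = full.select("id");      // keeps field id 1
// select(...) preserves the original field IDs; the equality-field ids
// recorded in each delete file tell readers which columns to compare.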

Aggregations

Schema (org.apache.iceberg.Schema): 126
Test (org.junit.Test): 93
Record (org.apache.iceberg.data.Record): 68
Table (org.apache.iceberg.Table): 55
PartitionSpec (org.apache.iceberg.PartitionSpec): 39
GenericRecord (org.apache.iceberg.data.GenericRecord): 36
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 30
List (java.util.List): 21
TableIdentifier (org.apache.iceberg.catalog.TableIdentifier): 20
IOException (java.io.IOException): 16
Types (org.apache.iceberg.types.Types): 16
ArrayList (java.util.ArrayList): 15
Map (java.util.Map): 14
HashMap (java.util.HashMap): 13
FileFormat (org.apache.iceberg.FileFormat): 13
UpdateSchema (org.apache.iceberg.UpdateSchema): 12
Path (org.apache.hadoop.fs.Path): 11
Collectors (java.util.stream.Collectors): 10
ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList): 10
TestHelper (org.apache.iceberg.mr.TestHelper): 9