
Example 46 with PartitionSpec

Use of org.apache.iceberg.PartitionSpec in project hive by apache.

In class TestHiveIcebergV2, method testReadAndWriteFormatV2Partitioned_EqDelete_OnlyEqColumnsSupplied.

@Test
public void testReadAndWriteFormatV2Partitioned_EqDelete_OnlyEqColumnsSupplied() throws IOException {
    Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " + "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("customer_id").build();
    Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
    // add one more row to the same partition
    shell.executeStatement("insert into customers values (1, 'Bob', 'Hoover')");
    // delete all rows with id=1 and first_name=Bob
    Schema shorterSchema = new Schema(optional(1, "id", Types.LongType.get()), optional(2, "name", Types.StringType.get()));
    List<Record> toDelete = TestHelper.RecordsBuilder.newInstance(shorterSchema).add(1L, "Bob").build();
    DeleteFile deleteFile = HiveIcebergTestUtils.createEqualityDeleteFile(tbl, "dummyPath", ImmutableList.of("customer_id", "first_name"), fileFormat, toDelete);
    tbl.newRowDelta().addDeletes(deleteFile).commit();
    List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id");
    Assert.assertEquals(2, objects.size());
    Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
    Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(1));
}
Also used : Table(org.apache.iceberg.Table) Schema(org.apache.iceberg.Schema) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) DeleteFile(org.apache.iceberg.DeleteFile) Test(org.junit.Test)
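
Note that shorterSchema above reuses field ids 1 and 2 from the table schema; Iceberg resolves equality delete columns by field id rather than by name, so "id" and "name" line up with customer_id and first_name even though the names differ. Conceptually, the delete then drops every live row that matches some delete record on those two columns. A rough sketch of that matching rule in plain Java (illustrative only; the variable allRows is an assumption, and this is not how Iceberg applies deletes internally):

// Illustrative matching rule for an equality delete on field ids (1, 2):
// a live row is dropped when some delete record equals it on both fields.
Predicate<Record> isDeleted = row -> toDelete.stream().anyMatch(del ->
    Objects.equals(del.getField("id"), row.getField("customer_id")) &&
    Objects.equals(del.getField("name"), row.getField("first_name")));
List<Record> remaining = allRows.stream()
    .filter(isDeleted.negate())
    .collect(Collectors.toList());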

Example 47 with PartitionSpec

Use of org.apache.iceberg.PartitionSpec in project hive by apache.

In class TestHiveIcebergV2, method testReadAndWriteFormatV2Partitioned_PosDelete_RowNotSupplied.

@Test
public void testReadAndWriteFormatV2Partitioned_PosDelete_RowNotSupplied() throws IOException {
    Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " + "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("customer_id").build();
    Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
    // add some more data to the same partition
    shell.executeStatement("insert into customers values (0, 'Laura', 'Yellow'), (0, 'John', 'Green'), " + "(0, 'Blake', 'Blue')");
    tbl.refresh();
    // delete the first and third rows from the newly-added data file - without supplying the row
    DataFile dataFile = StreamSupport.stream(tbl.currentSnapshot().addedFiles().spliterator(), false).filter(file -> file.partition().get(0, Long.class) == 0L).filter(file -> file.recordCount() == 3).findAny().orElseThrow(() -> new RuntimeException("Did not find the desired data file in the test table"));
    List<PositionDelete<Record>> deletes = ImmutableList.of(positionDelete(dataFile.path(), 0L, null), positionDelete(dataFile.path(), 2L, null));
    DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(tbl, "dummyPath", fileFormat, ImmutableMap.of("customer_id", 0L), deletes);
    tbl.newRowDelta().addDeletes(deleteFile).commit();
    List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id, first_name");
    Assert.assertEquals(4, objects.size());
    Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
    Assert.assertArrayEquals(new Object[] { 0L, "John", "Green" }, objects.get(1));
    Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, objects.get(2));
    Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(3));
}
Also used : DataFile(org.apache.iceberg.DataFile) Types(org.apache.iceberg.types.Types) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) Table(org.apache.iceberg.Table) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) IOException(java.io.IOException) Test(org.junit.Test) TestHelper(org.apache.iceberg.mr.TestHelper) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Schema(org.apache.iceberg.Schema) FileFormat(org.apache.iceberg.FileFormat) List(java.util.List) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) StreamSupport(java.util.stream.StreamSupport) DeleteFile(org.apache.iceberg.DeleteFile) Assume(org.junit.Assume) Assert(org.junit.Assert) PositionDelete(org.apache.iceberg.deletes.PositionDelete)
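
The positionDelete(...) calls above refer to a small helper in the test class. A minimal sketch of what such a helper might look like, assuming it wraps org.apache.iceberg.deletes.PositionDelete (passing null for the row, as this test does, produces a position-only delete):

private static PositionDelete<Record> positionDelete(CharSequence path, long pos, Record row) {
    // PositionDelete.create() returns a mutable holder; set(...) fills in the
    // target data file path, the row position within it, and (optionally) the row.
    PositionDelete<Record> posDelete = PositionDelete.create();
    return posDelete.set(path, pos, row);
}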

Example 48 with PartitionSpec

Use of org.apache.iceberg.PartitionSpec in project hive by apache.

In class TestHiveIcebergV2, method testReadAndWriteFormatV2Partitioned_EqDelete_AllColumnsSupplied.

@Test
public void testReadAndWriteFormatV2Partitioned_EqDelete_AllColumnsSupplied() throws IOException {
    Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " + "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("customer_id").build();
    Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);
    // add one more row to the same partition
    shell.executeStatement("insert into customers values (1, 'Bob', 'Hoover')");
    // delete all rows with id=1 and first_name=Bob
    List<Record> toDelete = TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).add(1L, "Bob", null).build();
    DeleteFile deleteFile = HiveIcebergTestUtils.createEqualityDeleteFile(tbl, "dummyPath", ImmutableList.of("customer_id", "first_name"), fileFormat, toDelete);
    tbl.newRowDelta().addDeletes(deleteFile).commit();
    List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id");
    Assert.assertEquals(2, objects.size());
    Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
    Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(1));
}
Also used : Table(org.apache.iceberg.Table) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) DeleteFile(org.apache.iceberg.DeleteFile) Test(org.junit.Test)
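
Compared with Example 46, this test supplies the delete record in the full table schema and leaves the non-equality column null. The two constructions should be interchangeable here, since only the columns named in the equality field list ("customer_id", "first_name") participate in matching:

// Full table schema, non-equality column left null (this example):
List<Record> toDelete = TestHelper.RecordsBuilder
    .newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
    .add(1L, "Bob", null).build();

// Projected schema carrying only the equality columns (Example 46):
Schema shorterSchema = new Schema(
    optional(1, "id", Types.LongType.get()),
    optional(2, "name", Types.StringType.get()));
List<Record> toDeleteProjected = TestHelper.RecordsBuilder
    .newInstance(shorterSchema).add(1L, "Bob").build();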

Example 49 with PartitionSpec

Use of org.apache.iceberg.PartitionSpec in project hive by apache.

In class TestHiveIcebergInserts, method testInsertOverwritePartitionedTable.

@Test
public void testInsertOverwritePartitionedTable() throws IOException {
    TableIdentifier target = TableIdentifier.of("default", "target");
    PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA).identity("last_name").build();
    Table table = testTables.createTable(shell, target.name(), HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec, fileFormat, ImmutableList.of());
    // IOW into empty target table -> whole source result set is inserted
    List<Record> expected = new ArrayList<>(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS);
    // add one more record to 'Green' so we have a partition with multiple records
    expected.add(TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
        .add(8L, "Sue", "Green").build().get(0));
    shell.executeStatement(testTables.getInsertQuery(expected, target, true));
    HiveIcebergTestUtils.validateData(table, expected, 0);
    // IOW into non-empty target table -> only the affected partitions are overwritten
    List<Record> newRecords = TestHelper.RecordsBuilder.newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
        .add(0L, "Mike", "Brown") // overwritten
        .add(1L, "Christy", "Green") // overwritten (partition has this single record now)
        .add(3L, "Bill", "Purple") // appended (new partition)
        .build();
    shell.executeStatement(testTables.getInsertQuery(newRecords, target, true));
    expected = new ArrayList<>(newRecords);
    // existing, untouched partition ('Pink')
    expected.add(HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2));
    HiveIcebergTestUtils.validateData(table, expected, 0);
    // IOW empty source result set -> has no effect on partitioned table
    shell.executeStatement("INSERT OVERWRITE TABLE target SELECT * FROM target WHERE FALSE");
    HiveIcebergTestUtils.validateData(table, expected, 0);
}
Also used : TableIdentifier(org.apache.iceberg.catalog.TableIdentifier) Table(org.apache.iceberg.Table) ArrayList(java.util.ArrayList) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) Test(org.junit.Test)
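
On a partitioned Iceberg table, Hive's INSERT OVERWRITE behaves as a dynamic partition overwrite: only the partitions present in the source result set are replaced, which is exactly what the assertions above verify. A minimal sketch of the corresponding core Iceberg operation (assuming dataFile is an already-written DataFile belonging to one of the affected partitions):

// Dynamic overwrite: replaces only the partitions covered by the added files;
// untouched partitions (like 'Pink' above) keep their existing data.
table.newReplacePartitions()
    .addFile(dataFile)
    .commit();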

Example 50 with PartitionSpec

Use of org.apache.iceberg.PartitionSpec in project hive by apache.

In class TestHiveIcebergOutputCommitter, method writeRecords.

/**
 * Write random records to the given table, using a separate {@link HiveIcebergRecordWriter} for
 * every task and the given {@link HiveIcebergOutputCommitter} to commit or abort those tasks.
 * @param name The name of the table to get the table object from the conf
 * @param taskNum The number of tasks in the job handled by the committer
 * @param attemptNum The id used for attempt number generation
 * @param commitTasks If <code>true</code> the tasks will be committed
 * @param abortTasks If <code>true</code> the tasks will be aborted - needed so we can simulate no commit/no abort
 *                   situation
 * @param conf The job configuration
 * @param committer The output committer that should be used for committing/aborting the tasks
 * @return The random generated records which were appended to the table
 * @throws IOException Propagating {@link HiveIcebergRecordWriter} exceptions
 */
private List<Record> writeRecords(String name, int taskNum, int attemptNum, boolean commitTasks, boolean abortTasks, JobConf conf, OutputCommitter committer) throws IOException {
    List<Record> expected = new ArrayList<>(RECORD_NUM * taskNum);
    Table table = HiveIcebergStorageHandler.table(conf, name);
    FileIO io = table.io();
    Schema schema = HiveIcebergStorageHandler.schema(conf);
    PartitionSpec spec = table.spec();
    for (int i = 0; i < taskNum; ++i) {
        List<Record> records = TestHelper.generateRandomRecords(schema, RECORD_NUM, i + attemptNum);
        TaskAttemptID taskId = new TaskAttemptID(JOB_ID.getJtIdentifier(), JOB_ID.getId(), TaskType.MAP, i, attemptNum);
        int partitionId = taskId.getTaskID().getId();
        String operationId = QUERY_ID + "-" + JOB_ID;
        FileFormat fileFormat = FileFormat.PARQUET;
        OutputFileFactory outputFileFactory = OutputFileFactory.builderFor(table, partitionId, attemptNum).format(fileFormat).operationId(operationId).build();
        HiveFileWriterFactory hfwf = new HiveFileWriterFactory(table, fileFormat, schema, null, fileFormat, null, null, null, null);
        HiveIcebergRecordWriter testWriter = new HiveIcebergRecordWriter(schema, spec, fileFormat, hfwf, outputFileFactory, io, TARGET_FILE_SIZE, TezUtil.taskAttemptWrapper(taskId), conf.get(Catalogs.NAME));
        Container<Record> container = new Container<>();
        for (Record record : records) {
            container.set(record);
            testWriter.write(container);
        }
        testWriter.close(false);
        if (commitTasks) {
            committer.commitTask(new TaskAttemptContextImpl(conf, taskId));
            expected.addAll(records);
        } else if (abortTasks) {
            committer.abortTask(new TaskAttemptContextImpl(conf, taskId));
        }
    }
    return expected;
}
Also used : OutputFileFactory(org.apache.iceberg.io.OutputFileFactory) Table(org.apache.iceberg.Table) TaskAttemptID(org.apache.hadoop.mapred.TaskAttemptID) Schema(org.apache.iceberg.Schema) ArrayList(java.util.ArrayList) FileFormat(org.apache.iceberg.FileFormat) PartitionSpec(org.apache.iceberg.PartitionSpec) FileIO(org.apache.iceberg.io.FileIO) Container(org.apache.iceberg.mr.mapred.Container) TaskAttemptContextImpl(org.apache.hadoop.mapred.TaskAttemptContextImpl) Record(org.apache.iceberg.data.Record)
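
A hypothetical invocation of the helper, with two committed tasks on attempt 0 (the table name and the surrounding fixture objects conf, committer and table are illustrative, not part of the snippet above):

// Writes RECORD_NUM random records per task and commits both tasks.
List<Record> expected = writeRecords("customers", 2, 0, true, false, conf, committer);
// After the job itself is committed, the table should contain exactly these records:
HiveIcebergTestUtils.validateData(table, expected, 0);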

Aggregations

PartitionSpec (org.apache.iceberg.PartitionSpec): 59
Test (org.junit.Test): 39
Table (org.apache.iceberg.Table): 38
Schema (org.apache.iceberg.Schema): 37
Record (org.apache.iceberg.data.Record): 19
TableIdentifier (org.apache.iceberg.catalog.TableIdentifier): 18
List (java.util.List): 10
FileFormat (org.apache.iceberg.FileFormat): 9
ArrayList (java.util.ArrayList): 8
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 8
ImmutableList (org.apache.iceberg.relocated.com.google.common.collect.ImmutableList): 7
IOException (java.io.IOException): 6
UpdateSchema (org.apache.iceberg.UpdateSchema): 6
BaseTable (org.apache.iceberg.BaseTable): 5
Path (org.apache.hadoop.fs.Path): 4
PartitionField (org.apache.iceberg.PartitionField): 4
Types (org.apache.iceberg.types.Types): 4
HdfsContext (com.facebook.presto.hive.HdfsContext): 3
PrestoException (com.facebook.presto.spi.PrestoException): 3
Map (java.util.Map): 3
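
Across these examples the dominant pattern is PartitionSpec.builderFor(schema) followed by an identity transform. For reference, a minimal sketch of the builder with a few other common Iceberg transforms (the schema and column names here are illustrative):

// Builds a spec over an assumed schema with columns id (long), category
// (string) and ts (timestamptz), combining several partition transforms.
static PartitionSpec exampleSpec(Schema schema) {
    return PartitionSpec.builderFor(schema)
        .identity("category") // partition by the raw column value
        .bucket("id", 16)     // hash ids into 16 buckets
        .day("ts")            // derive a date partition from the timestamp
        .build();
}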