Search in sources :

Example 1 with DeleteFile

use of org.apache.iceberg.DeleteFile in project hive by apache.

the class TestHiveIcebergV2 method testReadAndWriteFormatV2UnpartitionedWithEqDelete.

/**
 * Creates an unpartitioned "customers" table, commits an equality delete file keyed on
 * (customer_id, first_name) that targets the row (1, "Bob"), and checks that a Hive query
 * returns only the two remaining rows.
 */
@Test
public void testReadAndWriteFormatV2UnpartitionedWithEqDelete() throws IOException {
    // The test is skipped for vectorized execution and for ORC.
    boolean unsupportedSetup = isVectorized || fileFormat == FileFormat.ORC;
    Assume.assumeFalse(
            "Reading V2 tables with delete files are only supported currently in " + "non-vectorized mode and only Parquet/Avro",
            unsupportedSetup);

    Table customers = testTables.createTable(
            shell,
            "customers",
            HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
            PartitionSpec.unpartitioned(),
            fileFormat,
            HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS,
            2);

    // Only customer_id and first_name take part in the equality check, so last_name is left null.
    List<Record> rowsToDelete = TestHelper.RecordsBuilder
            .newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
            .add(1L, "Bob", null)
            .build();
    List<String> equalityColumns = ImmutableList.of("customer_id", "first_name");
    DeleteFile eqDeleteFile = HiveIcebergTestUtils.createEqualityDeleteFile(
            customers, "dummyPath", equalityColumns, fileFormat, rowsToDelete);
    customers.newRowDelta().addDeletes(eqDeleteFile).commit();

    List<Object[]> resultRows = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id");

    // Bob is gone; the other two customers are still readable.
    Object[][] expectedRows = {
        new Object[] { 0L, "Alice", "Brown" },
        new Object[] { 2L, "Trudy", "Pink" }
    };
    Assert.assertEquals(expectedRows.length, resultRows.size());
    for (int i = 0; i < expectedRows.length; i++) {
        Assert.assertArrayEquals(expectedRows[i], resultRows.get(i));
    }
}
Also used : Table(org.apache.iceberg.Table) Record(org.apache.iceberg.data.Record) DeleteFile(org.apache.iceberg.DeleteFile) Test(org.junit.Test)

Example 2 with DeleteFile

use of org.apache.iceberg.DeleteFile in project hive by apache.

the class TestHiveIcebergV2 method testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied.

/**
 * Creates a "customers" table identity-partitioned by customer_id, inserts three extra rows into
 * partition 0, commits a positional delete file (with the deleted rows supplied) that removes the
 * rows at positions 0 and 2 of the new data file, and checks the rows returned by a Hive query.
 */
@Test
public void testReadAndWriteFormatV2Partitioned_PosDelete_RowSupplied() throws IOException {
    // The test is skipped for vectorized execution and for ORC.
    boolean unsupportedSetup = isVectorized || fileFormat == FileFormat.ORC;
    Assume.assumeFalse(
            "Reading V2 tables with delete files are only supported currently in " + "non-vectorized mode and only Parquet/Avro",
            unsupportedSetup);

    PartitionSpec byCustomerId = PartitionSpec
            .builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
            .identity("customer_id")
            .build();
    Table customers = testTables.createTable(
            shell,
            "customers",
            HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
            byCustomerId,
            fileFormat,
            HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS,
            2);

    // Put three more rows into the customer_id = 0 partition and pick up the new snapshot.
    shell.executeStatement("insert into customers values (0, 'Laura', 'Yellow'), (0, 'John', 'Green'), " + "(0, 'Blake', 'Blue')");
    customers.refresh();

    // The data file written by the insert is the one in partition 0 holding exactly three records.
    DataFile dataFile = StreamSupport.stream(customers.currentSnapshot().addedFiles().spliterator(), false)
            .filter(file -> file.partition().get(0, Long.class) == 0L && file.recordCount() == 3)
            .findAny()
            .orElseThrow(() -> new RuntimeException("Did not find the desired data file in the test table"));

    // Delete the first and third rows of that file, supplying the deleted row contents as well.
    List<Record> deletedRows = TestHelper.RecordsBuilder
            .newInstance(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
            .add(0L, "Laura", "Yellow")
            .add(0L, "Blake", "Blue")
            .build();
    List<PositionDelete<Record>> deletes = ImmutableList.of(
            positionDelete(dataFile.path(), 0L, deletedRows.get(0)),
            positionDelete(dataFile.path(), 2L, deletedRows.get(1)));
    DeleteFile posDeleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(
            customers, "dummyPath", fileFormat, ImmutableMap.of("customer_id", 0L), deletes);
    customers.newRowDelta().addDeletes(posDeleteFile).commit();

    List<Object[]> resultRows = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id, first_name");

    Object[][] expectedRows = {
        new Object[] { 0L, "Alice", "Brown" },
        new Object[] { 0L, "John", "Green" },
        new Object[] { 1L, "Bob", "Green" },
        new Object[] { 2L, "Trudy", "Pink" }
    };
    Assert.assertEquals(expectedRows.length, resultRows.size());
    for (int i = 0; i < expectedRows.length; i++) {
        Assert.assertArrayEquals(expectedRows[i], resultRows.get(i));
    }
}
Also used : DataFile(org.apache.iceberg.DataFile) Types(org.apache.iceberg.types.Types) ImmutableMap(org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap) Table(org.apache.iceberg.Table) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) IOException(java.io.IOException) Test(org.junit.Test) TestHelper(org.apache.iceberg.mr.TestHelper) ImmutableList(org.apache.iceberg.relocated.com.google.common.collect.ImmutableList) Schema(org.apache.iceberg.Schema) FileFormat(org.apache.iceberg.FileFormat) List(java.util.List) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) StreamSupport(java.util.stream.StreamSupport) DeleteFile(org.apache.iceberg.DeleteFile) Assume(org.junit.Assume) DataFile(org.apache.iceberg.DataFile) Assert(org.junit.Assert) PositionDelete(org.apache.iceberg.deletes.PositionDelete) Table(org.apache.iceberg.Table) PositionDelete(org.apache.iceberg.deletes.PositionDelete) Record(org.apache.iceberg.data.Record) PartitionSpec(org.apache.iceberg.PartitionSpec) DeleteFile(org.apache.iceberg.DeleteFile) Test(org.junit.Test)

Example 3 with DeleteFile

use of org.apache.iceberg.DeleteFile in project hive by apache.

the class DeleteReadTests method testPositionDeletes.

/**
 * Writes a positional delete file that removes positions 0, 3 and 6 of the test data file,
 * commits it as a row delta, and checks that the table no longer contains the rows with ids
 * 29, 89 and 122.
 */
@Test
public void testPositionDeletes() throws IOException {
    CharSequence dataFilePath = dataFile.path();

    List<Pair<CharSequence, Long>> deletes = Lists.newArrayList();
    // id = 29
    deletes.add(Pair.of(dataFilePath, 0L));
    // id = 89
    deletes.add(Pair.of(dataFilePath, 3L));
    // id = 122
    deletes.add(Pair.of(dataFilePath, 6L));

    Pair<DeleteFile, Set<CharSequence>> posDeletes =
            FileHelpers.writeDeleteFile(table, Files.localOutput(temp.newFile()), Row.of(0), deletes);
    DeleteFile deleteFile = posDeletes.first();
    Set<CharSequence> referencedDataFiles = posDeletes.second();

    // The commit validates that the referenced data files still exist.
    table.newRowDelta()
            .addDeletes(deleteFile)
            .validateDataFilesExist(referencedDataFiles)
            .commit();

    StructLikeSet expected = rowSetWithoutIds(29, 89, 122);
    StructLikeSet actual = rowSet(tableName, table, "*");
    Assert.assertEquals("Table should contain expected rows", expected, actual);
}
Also used : StructLikeSet(org.apache.iceberg.util.StructLikeSet) Set(java.util.Set) StructLikeSet(org.apache.iceberg.util.StructLikeSet) Pair(org.apache.iceberg.util.Pair) DeleteFile(org.apache.iceberg.DeleteFile) Test(org.junit.Test)

Example 4 with DeleteFile

use of org.apache.iceberg.DeleteFile in project hive by apache.

the class HiveIcebergTestUtils method createEqualityDeleteFile.

/**
 * Writes an equality delete file for the given table and returns the resulting {@link DeleteFile}.
 * The file is only written here; committing it to the table is left to the caller.
 *
 * @param table The table to create the delete file for
 * @param deleteFilePath The path where the delete file should be created, relative to the table location root
 * @param equalityFields List of field names that should play a role in the equality check; every name must
 *                       exist in the table schema
 * @param fileFormat The file format that should be used for writing out the delete file
 * @param rowsToDelete The rows that should be deleted, must contain at least one row because the partition key
 *                     of the delete file is derived from the first one. It's enough to fill out the fields that
 *                     are relevant for the equality check, as listed in equalityFields, the rest of the fields
 *                     are ignored
 * @return The DeleteFile created
 * @throws IOException If there is an error during DeleteFile write
 * @throws IllegalArgumentException If rowsToDelete is null or empty, or if an equality field name is not
 *                                  present in the table schema
 */
public static DeleteFile createEqualityDeleteFile(Table table, String deleteFilePath, List<String> equalityFields, FileFormat fileFormat, List<Record> rowsToDelete) throws IOException {
    // Fail fast with a descriptive message instead of an IndexOutOfBoundsException/NullPointerException
    // from rowsToDelete.get(0) further down.
    if (rowsToDelete == null || rowsToDelete.isEmpty()) {
        throw new IllegalArgumentException("rowsToDelete must contain at least one row; the first row is used to compute the partition key of the delete file");
    }
    List<Integer> equalityFieldIds = equalityFields.stream().map(fieldName -> {
        // findField returns null for an unknown name; report which name was wrong instead of a bare NPE.
        if (table.schema().findField(fieldName) == null) {
            throw new IllegalArgumentException("Equality field '" + fieldName + "' does not exist in the table schema");
        }
        return table.schema().findField(fieldName).fieldId();
    }).collect(Collectors.toList());
    // The delete rows are written with a schema projected down to the equality fields only.
    Schema eqDeleteRowSchema = table.schema().select(equalityFields.toArray(new String[0]));
    FileAppenderFactory<Record> appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(), ArrayUtil.toIntArray(equalityFieldIds), eqDeleteRowSchema, null);
    // deleteFilePath is resolved against the table location.
    EncryptedOutputFile outputFile = table.encryption().encrypt(HadoopOutputFile.fromPath(new org.apache.hadoop.fs.Path(table.location(), deleteFilePath), new Configuration()));
    // The partition key is computed from the first row to delete.
    PartitionKey part = new PartitionKey(table.spec(), eqDeleteRowSchema);
    part.partition(rowsToDelete.get(0));
    EqualityDeleteWriter<Record> eqWriter = appenderFactory.newEqDeleteWriter(outputFile, fileFormat, part);
    try (EqualityDeleteWriter<Record> writer = eqWriter) {
        writer.deleteAll(rowsToDelete);
    }
    // toDeleteFile() is called only after the try-with-resources block has closed the writer.
    return eqWriter.toDeleteFile();
}
Also used : Arrays(java.util.Arrays) HadoopOutputFile(org.apache.iceberg.hadoop.HadoopOutputFile) Types(org.apache.iceberg.types.Types) Text(org.apache.hadoop.io.Text) NestedField.optional(org.apache.iceberg.types.Types.NestedField.optional) DateWritable(org.apache.hadoop.hive.serde2.io.DateWritable) StandardStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector) JobID(org.apache.hadoop.mapred.JobID) LongWritable(org.apache.hadoop.io.LongWritable) ByteBuffer(java.nio.ByteBuffer) BigDecimal(java.math.BigDecimal) TimestampUtils(org.apache.hadoop.hive.common.type.TimestampUtils) ArrayUtil(org.apache.iceberg.util.ArrayUtil) ByteBuffers(org.apache.iceberg.util.ByteBuffers) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) GenericRecord(org.apache.iceberg.data.GenericRecord) PositionDeleteWriter(org.apache.iceberg.deletes.PositionDeleteWriter) LocalTime(java.time.LocalTime) PartitionKey(org.apache.iceberg.PartitionKey) ZoneOffset(java.time.ZoneOffset) Path(java.nio.file.Path) IntWritable(org.apache.hadoop.io.IntWritable) CloseableIterable(org.apache.iceberg.io.CloseableIterable) Timestamp(java.sql.Timestamp) UUID(java.util.UUID) Schema(org.apache.iceberg.Schema) Collectors(java.util.stream.Collectors) List(java.util.List) OffsetDateTime(java.time.OffsetDateTime) BooleanWritable(org.apache.hadoop.io.BooleanWritable) EncryptedOutputFile(org.apache.iceberg.encryption.EncryptedOutputFile) LocalDate(java.time.LocalDate) GenericAppenderFactory(org.apache.iceberg.data.GenericAppenderFactory) PositionDelete(org.apache.iceberg.deletes.PositionDelete) LocalDateTime(java.time.LocalDateTime) IcebergGenerics(org.apache.iceberg.data.IcebergGenerics) DoubleWritable(org.apache.hadoop.io.DoubleWritable) ArrayList(java.util.ArrayList) BytesWritable(org.apache.hadoop.io.BytesWritable) TimestampWritable(org.apache.hadoop.hive.serde2.io.TimestampWritable) Files(java.nio.file.Files) Table(org.apache.iceberg.Table) 
EqualityDeleteWriter(org.apache.iceberg.deletes.EqualityDeleteWriter) IOException(java.io.IOException) FileFormat(org.apache.iceberg.FileFormat) File(java.io.File) Record(org.apache.iceberg.data.Record) ObjectInspectorFactory(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory) Paths(java.nio.file.Paths) TimestampTZUtil(org.apache.hadoop.hive.common.type.TimestampTZUtil) PrimitiveObjectInspectorFactory(org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory) HiveDecimal(org.apache.hadoop.hive.common.type.HiveDecimal) FileAppenderFactory(org.apache.iceberg.io.FileAppenderFactory) DeleteFile(org.apache.iceberg.DeleteFile) Comparator(java.util.Comparator) HiveDecimalWritable(org.apache.hadoop.hive.serde2.io.HiveDecimalWritable) Assert(org.junit.Assert) FloatWritable(org.apache.hadoop.io.FloatWritable) Path(java.nio.file.Path) Configuration(org.apache.hadoop.conf.Configuration) EncryptedOutputFile(org.apache.iceberg.encryption.EncryptedOutputFile) Schema(org.apache.iceberg.Schema) GenericAppenderFactory(org.apache.iceberg.data.GenericAppenderFactory) PartitionKey(org.apache.iceberg.PartitionKey) GenericRecord(org.apache.iceberg.data.GenericRecord) Record(org.apache.iceberg.data.Record)

Example 5 with DeleteFile

use of org.apache.iceberg.DeleteFile in project hive by apache.

the class TestHiveIcebergV2 method testReadAndWriteFormatV2Unpartitioned_PosDelete.

/**
 * Creates an unpartitioned "customers" table, commits a positional delete file that removes the
 * row at position 2 of the first added data file, and checks that a Hive query returns only the
 * two remaining rows.
 */
@Test
public void testReadAndWriteFormatV2Unpartitioned_PosDelete() throws IOException {
    // The test is skipped for vectorized execution and for ORC.
    boolean unsupportedSetup = isVectorized || fileFormat == FileFormat.ORC;
    Assume.assumeFalse(
            "Reading V2 tables with delete files are only supported currently in " + "non-vectorized mode and only Parquet/Avro",
            unsupportedSetup);

    Table customers = testTables.createTable(
            shell,
            "customers",
            HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA,
            PartitionSpec.unpartitioned(),
            fileFormat,
            HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS,
            2);

    // Take the first data file added by the current snapshot and delete the row at position 2 in it.
    DataFile dataFile = StreamSupport.stream(customers.currentSnapshot().addedFiles().spliterator(), false)
            .findFirst()
            .orElseThrow(() -> new RuntimeException("Did not find any data files for test table"));
    PositionDelete<Record> thirdRowDelete =
            positionDelete(dataFile.path(), 2L, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS.get(2));
    List<PositionDelete<Record>> deletes = ImmutableList.of(thirdRowDelete);
    DeleteFile posDeleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(
            customers, "dummyPath", fileFormat, null, deletes);
    customers.newRowDelta().addDeletes(posDeleteFile).commit();

    List<Object[]> resultRows = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id");

    // Only the two rows that were not deleted are returned.
    Object[][] expectedRows = {
        new Object[] { 0L, "Alice", "Brown" },
        new Object[] { 1L, "Bob", "Green" }
    };
    Assert.assertEquals(expectedRows.length, resultRows.size());
    for (int i = 0; i < expectedRows.length; i++) {
        Assert.assertArrayEquals(expectedRows[i], resultRows.get(i));
    }
}
Also used : DataFile(org.apache.iceberg.DataFile) Table(org.apache.iceberg.Table) PositionDelete(org.apache.iceberg.deletes.PositionDelete) DeleteFile(org.apache.iceberg.DeleteFile) Test(org.junit.Test)

Aggregations

DeleteFile (org.apache.iceberg.DeleteFile)14 Test (org.junit.Test)13 Schema (org.apache.iceberg.Schema)10 Table (org.apache.iceberg.Table)7 StructLikeSet (org.apache.iceberg.util.StructLikeSet)7 Record (org.apache.iceberg.data.Record)6 DataFile (org.apache.iceberg.DataFile)4 PartitionSpec (org.apache.iceberg.PartitionSpec)4 PositionDelete (org.apache.iceberg.deletes.PositionDelete)4 IOException (java.io.IOException)3 List (java.util.List)3 FileFormat (org.apache.iceberg.FileFormat)3 Types (org.apache.iceberg.types.Types)3 NestedField.optional (org.apache.iceberg.types.Types.NestedField.optional)3 Assert (org.junit.Assert)3 Set (java.util.Set)2 StreamSupport (java.util.stream.StreamSupport)2 File (java.io.File)1 BigDecimal (java.math.BigDecimal)1 ByteBuffer (java.nio.ByteBuffer)1